blob: 3cbc34fbb6f3f4c4c8c44aa32198a11fddf70ae1 [file] [log] [blame]
Eliza Margarethad28469f2014-03-10 12:42:21 +00001package de.ids_mannheim.korap.index;
2
margaretha4f995582015-12-14 14:14:34 +01003import static org.junit.Assert.assertEquals;
Eliza Margarethad28469f2014-03-10 12:42:21 +00004
5import java.io.IOException;
6
7import org.apache.lucene.index.Term;
8import org.apache.lucene.search.spans.SpanOrQuery;
9import org.apache.lucene.search.spans.SpanQuery;
10import org.apache.lucene.search.spans.SpanTermQuery;
11import org.junit.Test;
12
Nils Diewalda14ecd62015-02-26 21:00:20 +000013import de.ids_mannheim.korap.KrillIndex;
Eliza Margarethad28469f2014-03-10 12:42:21 +000014import de.ids_mannheim.korap.query.SpanNextQuery;
Eliza Margarethad4693462014-03-17 13:16:18 +000015import de.ids_mannheim.korap.query.SpanRepetitionQuery;
Eliza Margaretha6517c502016-10-10 18:19:24 +020016import de.ids_mannheim.korap.response.Match;
margaretha4f995582015-12-14 14:14:34 +010017import de.ids_mannheim.korap.response.Result;
Eliza Margarethad28469f2014-03-10 12:42:21 +000018
Eliza Margarethad4693462014-03-17 13:16:18 +000019public class TestRepetitionIndex {
Eliza Margarethad28469f2014-03-10 12:42:21 +000020
Nils Diewaldbb33da22015-03-04 16:24:25 +000021 private KrillIndex ki;
22 private Result kr;
23
24
25 private FieldDocument createFieldDoc0 () {
26 FieldDocument fd = new FieldDocument();
Eliza Margarethad28469f2014-03-10 12:42:21 +000027 fd.addString("ID", "doc-0");
Eliza Margaretha6f989202016-10-14 21:48:29 +020028 fd.addTV("base", "text",
29 "[(0-1)s:c|_1$<i>0<i>1]" + "[(1-2)s:e|_2$<i>1<i>2]"
30 + "[(2-3)s:c|_3$<i>2<i>3|<>:y$<b>64<i>2<i>4<i>4<b>0]"
31 + "[(3-4)s:c|s:b|_4$<i>3<i>4|<>:x$<b>64<i>3<i>7<i>7<b>0]"
32 + "[(4-5)s:e|s:d|_5$<i>4<i>5|<>:y$<b>64<i>4<i>6<i>6<b>0]"
33 + "[(5-6)s:c|_6$<i>5<i>6|<>:y$<b>64<i>5<i>8<i>8]"
34 + "[(6-7)s:d|_7$<i>6<i>7<b>0]"
35 + "[(7-8)s:e|_8$<i>7<i>8|<>:x$<b>64<i>7<i>9<i>9<b>0]"
36 + "[(8-9)s:e|s:b|_9$<i>8<i>9|<>:x$<b>64<i>8<i>10<i>10<b>0]"
37 + "[(9-10)s:d|_10$<i>9<i>10]");
Eliza Margarethad28469f2014-03-10 12:42:21 +000038 return fd;
39 }
Eliza Margaretha7788a982014-08-29 16:10:52 +000040
Nils Diewaldbb33da22015-03-04 16:24:25 +000041
42 private FieldDocument createFieldDoc1 () {
43 FieldDocument fd = new FieldDocument();
44 fd.addString("ID", "doc-1");
margaretha4f995582015-12-14 14:14:34 +010045 fd.addTV("base", "text", "[(0-1)s:b|_1$<i>0<i>1]"
46 + "[(1-2)s:e|_2$<i>1<i>2]" + "[(2-3)s:c|_3$<i>2<i>3]"
47 + "[(3-4)s:c|s:d]" + "[(4-5)s:d|s:c|_5$<i>4<i>5]"
48 + "[(5-6)s:e|s:c|_6$<i>5<i>6]" + "[(6-7)s:e|_7$<i>6<i>7]"
49 + "[(7-8)s:c|_8$<i>7<i>8]" + "[(8-9)s:d|_9$<i>8<i>9]"
50 + "[(9-10)s:d|_10$<i>9<i>10]");
Nils Diewaldbb33da22015-03-04 16:24:25 +000051 return fd;
52 }
53
54
55 private FieldDocument createFieldDoc2 () {
56 FieldDocument fd = new FieldDocument();
57 fd.addString("ID", "doc-2");
margaretha4f995582015-12-14 14:14:34 +010058 fd.addTV("base", "text",
59 "[(0-1)s:b|s:c|_1$<i>0<i>1|<>:s$<b>64<i>0<i>2<i>1<b>0]"
60 + "[(1-2)s:c|_2$<i>1<i>2]"
61 + "[(2-3)s:b|_3$<i>2<i>3|<>:s$<b>64<i>2<i>3<i>3<b>0]"
62 + "[(3-4)s:c|_4$<i>3<i>4|<>:s$<b>64<i>3<i>4<i>4<b>0]"
63 + "[(4-5)s:c|_5$<i>4<i>5|<>:s$<b>64<i>4<i>5<i>5]"
64 + "[(5-6)s:b|_6$<i>5<i>6<b>0]"
65 + "[(6-7)s:c|_7$<i>6<i>7|<>:s$<b>64<i>6<i>7<i>7<b>0]");
Nils Diewaldbb33da22015-03-04 16:24:25 +000066 return fd;
67 }
68
69
70 private FieldDocument createFieldDoc3 () {
71 FieldDocument fd = new FieldDocument();
72 fd.addString("ID", "doc-3");
margaretha4f995582015-12-14 14:14:34 +010073 fd.addTV("base", "text",
74 "[(0-1)s:a|_1$<i>0<i>1|<>:s$<b>64<i>0<i>2<i>1<b>0]"
75 + "[(1-2)s:d|_2$<i>1<i>2|<>:s$<b>64<i>1<i>2<i>3]"
76 + "[(2-3)s:e|_3$<i>2<i>3<b>0]");
Nils Diewaldbb33da22015-03-04 16:24:25 +000077 return fd;
78 }
79
Eliza Margaretha6f989202016-10-14 21:48:29 +020080
81 @Test
Eliza Margaretha6517c502016-10-10 18:19:24 +020082 public void testTermQuery () throws IOException {
83 ki = new KrillIndex();
84 ki.addDoc(createFieldDoc0());
85 ki.commit();
Eliza Margaretha6f989202016-10-14 21:48:29 +020086
Eliza Margaretha6517c502016-10-10 18:19:24 +020087 // Quantifier only
88 // c{1,2}
89 SpanQuery sq = new SpanRepetitionQuery(
90 new SpanTermQuery(new Term("base", "s:c")), 1, 2, true);
91 kr = ki.search(sq, (short) 10);
92 // 0-1, 2-3, 2-4, 3-4, 5-6
93 assertEquals((long) 5, kr.getTotalResults());
94 assertEquals(0, kr.getMatch(0).getStartPos());
95 assertEquals(1, kr.getMatch(0).getEndPos());
96 assertEquals(2, kr.getMatch(1).getStartPos());
97 assertEquals(3, kr.getMatch(1).getEndPos());
98 assertEquals(2, kr.getMatch(2).getStartPos());
99 assertEquals(4, kr.getMatch(2).getEndPos());
100 assertEquals(3, kr.getMatch(3).getStartPos());
101 assertEquals(4, kr.getMatch(3).getEndPos());
102 assertEquals(5, kr.getMatch(4).getStartPos());
103 assertEquals(6, kr.getMatch(4).getEndPos());
104 }
Eliza Margaretha6f989202016-10-14 21:48:29 +0200105
106
Nils Diewaldbb33da22015-03-04 16:24:25 +0000107 @Test
Eliza Margaretha6f989202016-10-14 21:48:29 +0200108 public void testRepetitionInSequences () throws IOException {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000109 ki = new KrillIndex();
Eliza Margarethad28469f2014-03-10 12:42:21 +0000110 ki.addDoc(createFieldDoc0());
111 ki.commit();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000112
Eliza Margarethad28469f2014-03-10 12:42:21 +0000113 SpanQuery sq, sq2;
Eliza Margarethad28469f2014-03-10 12:42:21 +0000114 // ec{1,2}
Nils Diewaldbb33da22015-03-04 16:24:25 +0000115 sq = new SpanNextQuery(new SpanTermQuery(new Term("base", "s:e")),
Eliza Margaretha6f989202016-10-14 21:48:29 +0200116 new SpanRepetitionQuery(
117 new SpanTermQuery(new Term("base", "s:c")), 1, 2,
118 true));
Nils Diewaldbb33da22015-03-04 16:24:25 +0000119
Eliza Margarethad28469f2014-03-10 12:42:21 +0000120 kr = ki.search(sq, (short) 10);
121 // 1-3, 1-4, 4-6
Nils Diewaldbb33da22015-03-04 16:24:25 +0000122 assertEquals((long) 3, kr.getTotalResults());
Eliza Margaretha6517c502016-10-10 18:19:24 +0200123 assertEquals(1, kr.getMatch(0).getStartPos());
124 assertEquals(3, kr.getMatch(0).getEndPos());
125 assertEquals(1, kr.getMatch(1).getStartPos());
126 assertEquals(4, kr.getMatch(1).getEndPos());
127 assertEquals(4, kr.getMatch(2).getStartPos());
128 assertEquals(6, kr.getMatch(2).getEndPos());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000129
Eliza Margarethad28469f2014-03-10 12:42:21 +0000130 // ec{1,2}d
Nils Diewaldbb33da22015-03-04 16:24:25 +0000131 sq2 = new SpanNextQuery(sq, new SpanTermQuery(new Term("base", "s:d")));
132 kr = ki.search(sq2, (short) 10);
133 assertEquals((long) 2, kr.getTotalResults());
Eliza Margarethad28469f2014-03-10 12:42:21 +0000134 assertEquals(1, kr.getMatch(0).startPos);
135 assertEquals(5, kr.getMatch(0).endPos);
136 assertEquals(4, kr.getMatch(1).startPos);
137 assertEquals(7, kr.getMatch(1).endPos);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000138
Eliza Margarethad28469f2014-03-10 12:42:21 +0000139 // Multiple documents
140 ki.addDoc(createFieldDoc1());
141 ki.commit();
142 kr = ki.search(sq2, (short) 10);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000143 assertEquals((long) 5, kr.getTotalResults());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000144 }
145
Eliza Margaretha6f989202016-10-14 21:48:29 +0200146
Eliza Margaretha6517c502016-10-10 18:19:24 +0200147 @Test
Eliza Margaretha6f989202016-10-14 21:48:29 +0200148 public void testMinZeroRepetition () throws IOException {
149 ki = new KrillIndex();
Eliza Margaretha6517c502016-10-10 18:19:24 +0200150 ki.addDoc(createFieldDoc0());
151 ki.commit();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200152
153 SpanQuery sq, sq2;
154 sq = new SpanTermQuery(new Term("base", "s:e"));
155 kr = ki.search(sq, (short) 10);
156
157 assertEquals((long) 4, kr.getTotalResults());
Eliza Margaretha6517c502016-10-10 18:19:24 +0200158 assertEquals(1, kr.getMatch(0).getStartPos());
159 assertEquals(2, kr.getMatch(0).getEndPos());
160 assertEquals(4, kr.getMatch(1).getStartPos());
161 assertEquals(5, kr.getMatch(1).getEndPos());
162 assertEquals(7, kr.getMatch(2).getStartPos());
163 assertEquals(8, kr.getMatch(2).getEndPos());
164 assertEquals(8, kr.getMatch(3).getStartPos());
165 assertEquals(9, kr.getMatch(3).getEndPos());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200166 try {
167 sq2 = new SpanNextQuery(sq, new SpanRepetitionQuery(
168 new SpanTermQuery(new Term("base", "s:c")), 0, 1, true));
169 }
170 catch (IllegalArgumentException e) {
171 assertEquals("Minimum repetition must not lower than 1.",
172 e.getMessage());
173 }
Eliza Margaretha6517c502016-10-10 18:19:24 +0200174 }
Nils Diewaldbb33da22015-03-04 16:24:25 +0000175
Eliza Margaretha6f989202016-10-14 21:48:29 +0200176
Nils Diewaldbb33da22015-03-04 16:24:25 +0000177 /** Skip to */
178 @Test
179 public void testCase2 () throws IOException {
180 ki = new KrillIndex();
Eliza Margarethad28469f2014-03-10 12:42:21 +0000181 ki.addDoc(createFieldDoc0());
182 ki.addDoc(createFieldDoc3());
183 ki.addDoc(createFieldDoc2());
184 ki.addDoc(createFieldDoc1());
185 ki.commit();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000186
Eliza Margarethad28469f2014-03-10 12:42:21 +0000187 SpanQuery sq;
Eliza Margaretha7788a982014-08-29 16:10:52 +0000188 // c{2,2}
margaretha4cfc89e2016-04-25 18:01:14 +0200189 // sq = new SpanRepetitionQuery(
190 // new SpanTermQuery(new Term("base", "s:c")), 2, 2, true);
191 // kr = ki.search(sq, (short) 10);
192 // // doc1 2-4, 3-5, 4-6
193 // assertEquals((long) 6, kr.getTotalResults());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000194
Eliza Margaretha7788a982014-08-29 16:10:52 +0000195 // ec{2,2}
Nils Diewaldbb33da22015-03-04 16:24:25 +0000196 sq = new SpanNextQuery(new SpanTermQuery(new Term("base", "s:e")),
Eliza Margaretha6f989202016-10-14 21:48:29 +0200197 new SpanRepetitionQuery(
198 new SpanTermQuery(new Term("base", "s:c")), 2, 2,
199 true));
Nils Diewaldbb33da22015-03-04 16:24:25 +0000200
201 kr = ki.search(sq, (short) 10);
202 assertEquals((long) 2, kr.getTotalResults());
203 assertEquals(3, kr.getMatch(1).getLocalDocID());
204
205 }
206
207
208 /** OR */
209 @Test
210 public void testCase3 () throws IOException {
211 ki = new KrillIndex();
Eliza Margarethad28469f2014-03-10 12:42:21 +0000212 ki.addDoc(createFieldDoc0());
213 ki.commit();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000214
215 SpanQuery sq, sq2;
216 // ec{1,2}
217 sq = new SpanNextQuery(new SpanTermQuery(new Term("base", "s:e")),
Eliza Margaretha6f989202016-10-14 21:48:29 +0200218 new SpanOrQuery(new SpanRepetitionQuery(
219 new SpanTermQuery(new Term("base", "s:c")), 1, 1, true),
220 new SpanRepetitionQuery(
221 new SpanTermQuery(new Term("base", "s:b")), 1,
222 1, true)));
Nils Diewaldbb33da22015-03-04 16:24:25 +0000223 kr = ki.search(sq, (short) 10);
224 assertEquals((long) 3, kr.getTotalResults());
Eliza Margarethad28469f2014-03-10 12:42:21 +0000225 assertEquals(1, kr.getMatch(0).startPos);
226 assertEquals(3, kr.getMatch(0).endPos);
227 assertEquals(4, kr.getMatch(1).startPos);
228 assertEquals(6, kr.getMatch(1).endPos);
229 assertEquals(7, kr.getMatch(2).startPos);
230 assertEquals(9, kr.getMatch(2).endPos);
Eliza Margaretha7788a982014-08-29 16:10:52 +0000231
Nils Diewaldbb33da22015-03-04 16:24:25 +0000232 }
233
234
235 @Test
236 public void testCase4 () throws IOException {
237 ki = new KrillIndex();
Eliza Margaretha7788a982014-08-29 16:10:52 +0000238 ki.addDoc(createFieldDoc1());
239 ki.commit();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000240
241 SpanQuery sq;
Eliza Margaretha7788a982014-08-29 16:10:52 +0000242 // c{2,2}
Eliza Margaretha6f989202016-10-14 21:48:29 +0200243 sq = new SpanRepetitionQuery(new SpanTermQuery(new Term("base", "s:c")),
244 1, 3, true);
Eliza Margaretha7788a982014-08-29 16:10:52 +0000245 kr = ki.search(sq, (short) 10);
246 // 2-3, 2-4, 2-5, 3-4, 3-5, 3-6, 4-5, 4-6, 5-6, 7-8
Nils Diewaldbb33da22015-03-04 16:24:25 +0000247 assertEquals((long) 10, kr.getTotalResults());
248
Eliza Margaretha6f989202016-10-14 21:48:29 +0200249 sq = new SpanRepetitionQuery(new SpanTermQuery(new Term("base", "s:c")),
250 2, 3, true);
Eliza Margaretha7788a982014-08-29 16:10:52 +0000251 kr = ki.search(sq, (short) 10);
252 // 2-4, 2-5, 3-5, 3-6, 4-6
Nils Diewaldbb33da22015-03-04 16:24:25 +0000253 assertEquals((long) 5, kr.getTotalResults());
254
255 // System.out.print(kr.getTotalResults()+"\n");
256 // for (int i=0; i< kr.getTotalResults(); i++){
257 // System.out.println(
258 // kr.match(i).getLocalDocID()+" "+
259 // kr.match(i).startPos + " " +
260 // kr.match(i).endPos
261 // );
262 // }
263 }
264
Akronf9def5e2016-10-10 21:26:46 +0200265
Nils Diewaldbb33da22015-03-04 16:24:25 +0000266 @Test
267 public void testCase5 () throws IOException {
268 ki = new KrillIndex();
269 ki.addDoc(getClass().getResourceAsStream("/wiki/00001.json.gz"), true);
270 ki.commit();
271
272 SpanQuery sq0, sq1, sq2;
273 sq0 = new SpanTermQuery(new Term("tokens", "tt/p:NN"));
Eliza Margaretha6f989202016-10-14 21:48:29 +0200274 sq1 = new SpanRepetitionQuery(
275 new SpanTermQuery(new Term("tokens", "tt/p:ADJA")), 2, 3, true);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000276 sq2 = new SpanNextQuery(sq1, sq0);
Eliza Margaretha7788a982014-08-29 16:10:52 +0000277 kr = ki.search(sq2, (short) 10);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000278
279 assertEquals((long) 2, kr.getTotalResults());
Eliza Margarethafaa548f2014-09-30 17:22:11 +0000280 assertEquals(73, kr.getMatch(0).getStartPos());
281 assertEquals(77, kr.getMatch(0).getEndPos());
282 assertEquals(74, kr.getMatch(1).getStartPos());
283 assertEquals(77, kr.getMatch(1).getEndPos());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000284
Eliza Margaretha6f989202016-10-14 21:48:29 +0200285
286 sq2 = new SpanNextQuery(
287 new SpanTermQuery(new Term("tokens", "s:offenen")), sq2);
Eliza Margaretha7788a982014-08-29 16:10:52 +0000288 kr = ki.search(sq2, (short) 10);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000289
290 assertEquals((long) 1, kr.getTotalResults());
Eliza Margarethafaa548f2014-09-30 17:22:11 +0000291 assertEquals(73, kr.getMatch(0).getStartPos());
292 assertEquals(77, kr.getMatch(0).getEndPos());
293 /*
Nils Diewald392bcf32015-02-26 20:01:17 +0000294 for (Match km : kr.getMatches()){
Eliza Margaretha7788a982014-08-29 16:10:52 +0000295 System.out.println(km.getSnippetBrackets());
296 System.out.println(km.getStartPos() +","+km.getEndPos());
297 }*/
Akrondfc93572016-08-10 19:01:34 +0200298 };
Eliza Margarethad28469f2014-03-10 12:42:21 +0000299}