blob: b4043736d508936fddb50e6bea07e1cb367a43d9 [file] [log] [blame]
Eliza Margarethad28469f2014-03-10 12:42:21 +00001package de.ids_mannheim.korap.index;
2
3import static org.junit.Assert.*;
4
5import java.io.IOException;
6
7import org.apache.lucene.index.Term;
8import org.apache.lucene.search.spans.SpanOrQuery;
9import org.apache.lucene.search.spans.SpanQuery;
10import org.apache.lucene.search.spans.SpanTermQuery;
11import org.junit.Test;
12
13import de.ids_mannheim.korap.KorapIndex;
Eliza Margaretha7788a982014-08-29 16:10:52 +000014import de.ids_mannheim.korap.KorapMatch;
Eliza Margarethad28469f2014-03-10 12:42:21 +000015import de.ids_mannheim.korap.KorapResult;
16import de.ids_mannheim.korap.query.SpanNextQuery;
Eliza Margarethad4693462014-03-17 13:16:18 +000017import de.ids_mannheim.korap.query.SpanRepetitionQuery;
Eliza Margarethad28469f2014-03-10 12:42:21 +000018
Eliza Margarethad4693462014-03-17 13:16:18 +000019public class TestRepetitionIndex {
Eliza Margarethad28469f2014-03-10 12:42:21 +000020
21 private KorapIndex ki;
22 private KorapResult kr;
23
24 private FieldDocument createFieldDoc0(){
25 FieldDocument fd = new FieldDocument();
26 fd.addString("ID", "doc-0");
27 fd.addTV("base",
28 "text",
29 "[(0-1)s:c|_1#0-1]" +
30 "[(1-2)s:e|_2#1-2]" +
31 "[(2-3)s:c|_3#2-3|<>:y#2-4$<i>4]" +
32 "[(3-4)s:c|s:b|_4#3-4|<>:x#3-7$<i>7]" +
33 "[(4-5)s:e|s:d|_5#4-5|<>:y#4-6$<i>6]" +
34 "[(5-6)s:c|_6#5-6|<>:y#5-8$<i>8]" +
35 "[(6-7)s:d|_7#6-7]" +
36 "[(7-8)s:e|_8#7-8|<>:x#7-9$<i>9]" +
37 "[(8-9)s:e|s:b|_9#8-9|<>:x#8-10$<i>10]" +
38 "[(9-10)s:d|_10#9-10]");
39 return fd;
40 }
41
42 private FieldDocument createFieldDoc1() {
43 FieldDocument fd = new FieldDocument();
44 fd.addString("ID", "doc-1");
45 fd.addTV("base",
46 "text",
Eliza Margaretha7788a982014-08-29 16:10:52 +000047 "[(0-1)s:b|_1#0-1]" +
48 "[(1-2)s:e|_2#1-2]" +
Eliza Margarethad28469f2014-03-10 12:42:21 +000049 "[(2-3)s:c|_3#2-3]" +
Eliza Margaretha7788a982014-08-29 16:10:52 +000050 "[(3-4)s:c|s:d]" +
51 "[(4-5)s:d|s:c|_5#4-5]" +
52 "[(5-6)s:e|s:c|_6#5-6]" +
Eliza Margarethad28469f2014-03-10 12:42:21 +000053 "[(6-7)s:e|_7#6-7]" +
Eliza Margaretha7788a982014-08-29 16:10:52 +000054 "[(7-8)s:c|_8#7-8]" +
55 "[(8-9)s:d|_9#8-9]" +
Eliza Margarethad28469f2014-03-10 12:42:21 +000056 "[(9-10)s:d|_10#9-10]");
57 return fd;
58 }
59
60 private FieldDocument createFieldDoc2() {
61 FieldDocument fd = new FieldDocument();
62 fd.addString("ID", "doc-2");
63 fd.addTV("base",
64 "text",
65 "[(0-1)s:b|s:c|_1#0-1|<>:s#0-2$<i>1]" +
66 "[(1-2)s:c|_2#1-2]" +
67 "[(2-3)s:b|_3#2-3|<>:s#2-3$<i>3]" +
68 "[(3-4)s:c|_4#3-4|<>:s#3-4$<i>4]" +
69 "[(4-5)s:c|_5#4-5|<>:s#4-5$<i>5]" +
70 "[(5-6)s:b|_6#5-6]" +
71 "[(6-7)s:c|_7#6-7|<>:s#6-7$<i>7]");
72 return fd;
73 }
74
75 private FieldDocument createFieldDoc3() {
76 FieldDocument fd = new FieldDocument();
77 fd.addString("ID", "doc-3");
78 fd.addTV("base",
79 "text",
80 "[(0-1)s:a|_1#0-1|<>:s#0-2$<i>1]" +
81 "[(1-2)s:d|_2#1-2|<>:s#1-2$<i>3]" +
82 "[(2-3)s:e|_3#2-3]");
83 return fd;
84 }
Eliza Margaretha7788a982014-08-29 16:10:52 +000085
Eliza Margarethad28469f2014-03-10 12:42:21 +000086 @Test
87 public void testCase1() throws IOException{
88 ki = new KorapIndex();
89 ki.addDoc(createFieldDoc0());
90 ki.commit();
91
92 SpanQuery sq, sq2;
93 // Quantifier only
Eliza Margaretha7788a982014-08-29 16:10:52 +000094 // c{1,2}
Eliza Margarethad4693462014-03-17 13:16:18 +000095 sq = new SpanRepetitionQuery(new SpanTermQuery(new Term("base","s:c")),1,2, true);
Eliza Margarethad28469f2014-03-10 12:42:21 +000096 kr = ki.search(sq, (short) 10);
97 // 0-1, 2-3, 2-4, 3-4, 5-6
98 assertEquals(5,kr.getTotalResults());
99
100 // ec{1,2}
101 sq = new SpanNextQuery(
102 new SpanTermQuery(new Term("base", "s:e")),
Eliza Margarethad4693462014-03-17 13:16:18 +0000103 new SpanRepetitionQuery(new SpanTermQuery(new Term("base","s:c")),1,2, true)
Eliza Margarethad28469f2014-03-10 12:42:21 +0000104 );
105
106 kr = ki.search(sq, (short) 10);
107 // 1-3, 1-4, 4-6
108 assertEquals(3,kr.getTotalResults());
109
110 // ec{1,2}d
111 sq2 = new SpanNextQuery(sq, new SpanTermQuery(new Term("base", "s:d")));
112 kr = ki.search(sq2, (short) 10);
113 assertEquals(2,kr.getTotalResults());
114 assertEquals(1, kr.getMatch(0).startPos);
115 assertEquals(5, kr.getMatch(0).endPos);
116 assertEquals(4, kr.getMatch(1).startPos);
117 assertEquals(7, kr.getMatch(1).endPos);
118
119 // Multiple documents
120 ki.addDoc(createFieldDoc1());
121 ki.commit();
122 kr = ki.search(sq2, (short) 10);
123 assertEquals(5,kr.getTotalResults());
124 }
125
126 /** Skip to */
127 @Test
128 public void testCase2() throws IOException{
129 ki = new KorapIndex();
130 ki.addDoc(createFieldDoc0());
131 ki.addDoc(createFieldDoc3());
132 ki.addDoc(createFieldDoc2());
133 ki.addDoc(createFieldDoc1());
134 ki.commit();
135
136 SpanQuery sq;
Eliza Margaretha7788a982014-08-29 16:10:52 +0000137 // c{2,2}
Eliza Margarethad4693462014-03-17 13:16:18 +0000138 sq = new SpanRepetitionQuery(new SpanTermQuery(new Term("base","s:c")),2,2, true);
Eliza Margarethad28469f2014-03-10 12:42:21 +0000139 kr = ki.search(sq, (short) 10);
Eliza Margaretha7788a982014-08-29 16:10:52 +0000140 // doc1 2-4, 3-5, 4-6
141 assertEquals(6,kr.getTotalResults());
142
143 // ec{2,2}
Eliza Margarethad28469f2014-03-10 12:42:21 +0000144 kr = ki.search(sq, (short) 10);
145 sq = new SpanNextQuery(
146 new SpanTermQuery(new Term("base", "s:e")),
Eliza Margarethad4693462014-03-17 13:16:18 +0000147 new SpanRepetitionQuery(new SpanTermQuery(new Term("base","s:c")),2,2, true)
Eliza Margarethad28469f2014-03-10 12:42:21 +0000148 );
149
150 kr = ki.search(sq, (short) 10);
151 assertEquals(2,kr.getTotalResults());
152 assertEquals(3,kr.getMatch(1).getLocalDocID());
153
154 }
155
156 /** OR */
157 @Test
158 public void testCase3() throws IOException{
159 ki = new KorapIndex();
160 ki.addDoc(createFieldDoc0());
161 ki.commit();
162
163 SpanQuery sq,sq2;
164 // ec{1,2}
165 sq = new SpanNextQuery(
166 new SpanTermQuery(new Term("base", "s:e")),
167 new SpanOrQuery(
Eliza Margarethad4693462014-03-17 13:16:18 +0000168 new SpanRepetitionQuery(new SpanTermQuery(new Term("base","s:c")),1,1, true),
169 new SpanRepetitionQuery(new SpanTermQuery(new Term("base","s:b")),1,1, true)
Eliza Margarethad28469f2014-03-10 12:42:21 +0000170 )
171 );
172 kr = ki.search(sq, (short) 10);
173 assertEquals(3,kr.getTotalResults());
174 assertEquals(1, kr.getMatch(0).startPos);
175 assertEquals(3, kr.getMatch(0).endPos);
176 assertEquals(4, kr.getMatch(1).startPos);
177 assertEquals(6, kr.getMatch(1).endPos);
178 assertEquals(7, kr.getMatch(2).startPos);
179 assertEquals(9, kr.getMatch(2).endPos);
Eliza Margaretha7788a982014-08-29 16:10:52 +0000180
181 }
182
183 @Test
184 public void testCase4() throws IOException {
185 ki = new KorapIndex();
186 ki.addDoc(createFieldDoc1());
187 ki.commit();
188
189 SpanQuery sq;
190 // c{2,2}
191 sq = new SpanRepetitionQuery(new SpanTermQuery(new Term("base","s:c")),1,3, true);
192 kr = ki.search(sq, (short) 10);
193 // 2-3, 2-4, 2-5, 3-4, 3-5, 3-6, 4-5, 4-6, 5-6, 7-8
194 assertEquals(10,kr.getTotalResults());
195
196 sq = new SpanRepetitionQuery(new SpanTermQuery(new Term("base","s:c")),2,3, true);
197 kr = ki.search(sq, (short) 10);
198 // 2-4, 2-5, 3-5, 3-6, 4-6
199 assertEquals(5,kr.getTotalResults());
Eliza Margarethad28469f2014-03-10 12:42:21 +0000200
201// System.out.print(kr.getTotalResults()+"\n");
202// for (int i=0; i< kr.getTotalResults(); i++){
203// System.out.println(
204// kr.match(i).getLocalDocID()+" "+
205// kr.match(i).startPos + " " +
206// kr.match(i).endPos
Eliza Margaretha7788a982014-08-29 16:10:52 +0000207// );
Eliza Margarethad28469f2014-03-10 12:42:21 +0000208// }
209 }
Eliza Margaretha7788a982014-08-29 16:10:52 +0000210
211 @Test
212 public void testCase5() throws IOException {
213 ki = new KorapIndex();
Eliza Margarethafaa548f2014-09-30 17:22:11 +0000214 ki.addDocFile(
215 getClass().getResource("/wiki/00001.json.gz").getFile(), true
216 );
Eliza Margaretha7788a982014-08-29 16:10:52 +0000217 ki.commit();
218
219 SpanQuery sq0, sq1, sq2;
Eliza Margarethafaa548f2014-09-30 17:22:11 +0000220 sq0 = new SpanTermQuery(new Term("tokens", "tt/p:NN"));
221 sq1 = new SpanRepetitionQuery(new SpanTermQuery(new Term("tokens","tt/p:ADJA")),2,3, true);
Eliza Margaretha7788a982014-08-29 16:10:52 +0000222 sq2 = new SpanNextQuery(sq1,sq0);
223 kr = ki.search(sq2, (short) 10);
224
225 assertEquals(2,kr.getTotalResults());
Eliza Margarethafaa548f2014-09-30 17:22:11 +0000226 assertEquals(73, kr.getMatch(0).getStartPos());
227 assertEquals(77, kr.getMatch(0).getEndPos());
228 assertEquals(74, kr.getMatch(1).getStartPos());
229 assertEquals(77, kr.getMatch(1).getEndPos());
230 /* for (KorapMatch km : kr.getMatches()){
231 System.out.println(km.getSnippetBrackets());
232 System.out.println(km.getStartPos() +","+km.getEndPos());
233 }*/
Eliza Margaretha7788a982014-08-29 16:10:52 +0000234
235 sq2 = new SpanNextQuery(
Eliza Margarethafaa548f2014-09-30 17:22:11 +0000236 new SpanTermQuery(new Term("tokens", "s:offenen")),
Eliza Margaretha7788a982014-08-29 16:10:52 +0000237 sq2);
238 kr = ki.search(sq2, (short) 10);
239
240 assertEquals(1,kr.getTotalResults());
Eliza Margarethafaa548f2014-09-30 17:22:11 +0000241 assertEquals(73, kr.getMatch(0).getStartPos());
242 assertEquals(77, kr.getMatch(0).getEndPos());
243 /*
244 for (KorapMatch km : kr.getMatches()){
Eliza Margaretha7788a982014-08-29 16:10:52 +0000245 System.out.println(km.getSnippetBrackets());
246 System.out.println(km.getStartPos() +","+km.getEndPos());
247 }*/
248 }
Eliza Margarethad28469f2014-03-10 12:42:21 +0000249}