blob: bff660804a2ed6110c32fb672f285822b4c0cb5b [file] [log] [blame]
Eliza Margaretha01929182014-02-19 11:48:59 +00001package de.ids_mannheim.korap.index;
2
margaretha4f995582015-12-14 14:14:34 +01003import static org.junit.Assert.assertEquals;
Nils Diewaldb0dd9552013-12-20 02:28:34 +00004
margaretha4f995582015-12-14 14:14:34 +01005import org.apache.lucene.search.spans.SpanQuery;
Nils Diewaldb0dd9552013-12-20 02:28:34 +00006import org.junit.Test;
Nils Diewaldb0dd9552013-12-20 02:28:34 +00007import org.junit.runner.RunWith;
8import org.junit.runners.JUnit4;
9
Nils Diewaldbbd39a52015-02-23 19:56:57 +000010import de.ids_mannheim.korap.Krill;
margaretha4f995582015-12-14 14:14:34 +010011import de.ids_mannheim.korap.KrillIndex;
Nils Diewald8904c1d2015-02-26 16:13:18 +000012import de.ids_mannheim.korap.query.QueryBuilder;
margaretha4f995582015-12-14 14:14:34 +010013import de.ids_mannheim.korap.response.Result;
Nils Diewaldb0dd9552013-12-20 02:28:34 +000014
15@RunWith(JUnit4.class)
16public class TestRegexWildcardIndex {
17
18 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +000019 public void indexRegex () throws Exception {
Nils Diewaldbb33da22015-03-04 16:24:25 +000020 KrillIndex ki = new KrillIndex();
Nils Diewaldb0dd9552013-12-20 02:28:34 +000021
Nils Diewaldbb33da22015-03-04 16:24:25 +000022 // abcabcabac
23 FieldDocument fd = new FieldDocument();
24 fd.addTV(
25 "base",
26 "affe afffe baum baumgarten steingarten franz hans haus efeu effe",
margaretha4f995582015-12-14 14:14:34 +010027 "[(0-4)s:affe|_0$<i>0<i>4|-:t$<i>10]"
28 + "[(5-10)s:afffe|_1$<i>5<i>10]"
29 + "[(11-15)s:baum|_2$<i>11<i>15]"
30 + "[(16-26)s:baumgarten|_3$<i>16<i>26]"
31 + "[(27-38)s:steingarten|_4$<i>27<i>38]"
32 + "[(39-44)s:franz|_5$<i>39<i>44]"
33 + "[(45-49)s:hans|_6$<i>45<i>49]"
34 + "[(50-54)s:haus|_7$<i>50<i>54]"
35 + "[(55-59)s:efeu|_8$<i>55<i>59]"
36 + "[(60-64)s:effe|_9$<i>60<i>64]");
Nils Diewaldbb33da22015-03-04 16:24:25 +000037 ki.addDoc(fd);
Nils Diewaldb0dd9552013-12-20 02:28:34 +000038
Nils Diewaldbb33da22015-03-04 16:24:25 +000039 ki.commit();
Nils Diewaldb0dd9552013-12-20 02:28:34 +000040
Nils Diewaldbb33da22015-03-04 16:24:25 +000041 QueryBuilder kq = new QueryBuilder("base");
42 SpanQuery sq = kq.re("s:af*e").toQuery();
43 assertEquals("SpanMultiTermQueryWrapper(base:/s:af*e/)", sq.toString());
Nils Diewaldb0dd9552013-12-20 02:28:34 +000044
Nils Diewaldbb33da22015-03-04 16:24:25 +000045 Krill ks = new Krill(sq);
46 ks.getMeta().getContext().left.setToken(true).setLength(1);
47 ks.getMeta().getContext().right.setToken(true).setLength(1);
Nils Diewaldb0dd9552013-12-20 02:28:34 +000048
Nils Diewaldbb33da22015-03-04 16:24:25 +000049 Result kr = ki.search(ks);
50 assertEquals((long) 2, kr.getTotalResults());
51 assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
52 assertEquals("affe [afffe] baum ...", kr.getMatch(1)
53 .getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +000054
Nils Diewaldbb33da22015-03-04 16:24:25 +000055 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").re("s:baum.*")
56 .toQuery()));
57 assertEquals((long) 2, kr.getTotalResults());
58 assertEquals("... afffe [baum] baumgarten ...", kr.getMatch(0)
59 .getSnippetBrackets());
60 assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(1)
61 .getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +000062
Nils Diewaldbb33da22015-03-04 16:24:25 +000063 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").re(
64 "s:.....?garten").toQuery()));
65 assertEquals((long) 2, kr.getTotalResults());
66 assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(0)
67 .getSnippetBrackets());
68 assertEquals("... baumgarten [steingarten] franz ...", kr.getMatch(1)
69 .getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +000070
Nils Diewaldbb33da22015-03-04 16:24:25 +000071 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").re("s:ha.s")
72 .toQuery()));
73 assertEquals((long) 2, kr.getTotalResults());
74 assertEquals("... franz [hans] haus ...", kr.getMatch(0)
75 .getSnippetBrackets());
76 assertEquals("... hans [haus] efeu ...", kr.getMatch(1)
77 .getSnippetBrackets());
78
79 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").re("s:.*ff.*")
80 .toQuery()));
81 assertEquals((long) 3, kr.getTotalResults());
82 assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
83 assertEquals("affe [afffe] baum ...", kr.getMatch(1)
84 .getSnippetBrackets());
85 assertEquals("... efeu [effe]", kr.getMatch(2).getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +000086 };
87
Nils Diewaldbb33da22015-03-04 16:24:25 +000088
Nils Diewaldb0dd9552013-12-20 02:28:34 +000089 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +000090 public void indexWildcard () throws Exception {
Nils Diewaldbb33da22015-03-04 16:24:25 +000091 KrillIndex ki = new KrillIndex();
Nils Diewaldb0dd9552013-12-20 02:28:34 +000092
Nils Diewaldbb33da22015-03-04 16:24:25 +000093 // abcabcabac
94 FieldDocument fd = new FieldDocument();
95 fd.addTV(
96 "base",
97 "affe afffe baum baumgarten steingarten franz hans haus efeu effe",
margaretha4f995582015-12-14 14:14:34 +010098 "[(0-4)s:affe|_0$<i>0<i>4|-:t$<i>10]"
99 + "[(5-10)s:afffe|_1$<i>5<i>10]"
100 + "[(11-15)s:baum|_2$<i>11<i>15]"
101 + "[(16-26)s:baumgarten|_3$<i>16<i>26]"
102 + "[(27-38)s:steingarten|_4$<i>27<i>38]"
103 + "[(39-44)s:franz|_5$<i>39<i>44]"
104 + "[(45-49)s:hans|_6$<i>45<i>49]"
105 + "[(50-54)s:haus|_7$<i>50<i>54]"
106 + "[(55-59)s:efeu|_8$<i>55<i>59]"
107 + "[(60-64)s:effe|_9$<i>60<i>64]");
Nils Diewaldbb33da22015-03-04 16:24:25 +0000108 ki.addDoc(fd);
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000109
Nils Diewaldbb33da22015-03-04 16:24:25 +0000110 ki.commit();
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000111
Nils Diewaldbb33da22015-03-04 16:24:25 +0000112 QueryBuilder kq = new QueryBuilder("base");
113 SpanQuery sq = kq.wc("s:af*e").toQuery();
114 assertEquals("SpanMultiTermQueryWrapper(base:s:af*e)", sq.toString());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000115
Nils Diewaldbb33da22015-03-04 16:24:25 +0000116 Krill ks = new Krill(sq);
117 ks.getMeta().getContext().left.setToken(true).setLength(1);
118 ks.getMeta().getContext().right.setToken(true).setLength(1);
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000119
Nils Diewaldbb33da22015-03-04 16:24:25 +0000120 Result kr = ki.search(ks);
121 assertEquals((long) 2, kr.getTotalResults());
122 assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
123 assertEquals("affe [afffe] baum ...", kr.getMatch(1)
124 .getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000125
Nils Diewaldbb33da22015-03-04 16:24:25 +0000126 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").wc("s:baum.*")
127 .toQuery()));
128 assertEquals((long) 0, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000129
Nils Diewaldbb33da22015-03-04 16:24:25 +0000130 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").wc("s:baum*")
131 .toQuery()));
132 assertEquals((long) 2, kr.getTotalResults());
133 assertEquals("... afffe [baum] baumgarten ...", kr.getMatch(0)
134 .getSnippetBrackets());
135 assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(1)
136 .getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000137
Nils Diewaldbb33da22015-03-04 16:24:25 +0000138 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").wc("s:*garten")
139 .toQuery()));
140 assertEquals((long) 2, kr.getTotalResults());
141 assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(0)
142 .getSnippetBrackets());
143 assertEquals("... baumgarten [steingarten] franz ...", kr.getMatch(1)
144 .getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000145
Nils Diewaldbb33da22015-03-04 16:24:25 +0000146 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").wc("s:ha?s")
147 .toQuery()));
148 assertEquals((long) 2, kr.getTotalResults());
149 assertEquals("... franz [hans] haus ...", kr.getMatch(0)
150 .getSnippetBrackets());
151 assertEquals("... hans [haus] efeu ...", kr.getMatch(1)
152 .getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000153
Nils Diewaldbb33da22015-03-04 16:24:25 +0000154 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").wc("s:?ff?")
155 .toQuery()));
156 assertEquals((long) 2, kr.getTotalResults());
157 assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
158 assertEquals("... efeu [effe]", kr.getMatch(1).getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000159 };
160
Nils Diewaldbb33da22015-03-04 16:24:25 +0000161
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000162 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +0000163 public void indexRegexCaseInsensitive () throws Exception {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000164 KrillIndex ki = new KrillIndex();
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000165
Nils Diewaldbb33da22015-03-04 16:24:25 +0000166 // abcabcabac
167 FieldDocument fd = new FieldDocument();
168 fd.addTV(
169 "base",
170 "AfFe aFfFE Baum Baumgarten SteinGarten franZ HaNs Haus Efeu effe",
margaretha4f995582015-12-14 14:14:34 +0100171 "[(0-4)s:AfFe|i:affe|_0$<i>0<i>4|-:t$<i>10]"
172 + "[(5-10)s:aFfFE|i:afffe|_1$<i>5<i>10]"
173 + "[(11-15)s:Baum|i:baum|_2$<i>11<i>15]"
174 + "[(16-26)s:Baumgarten|i:baumgarten|_3$<i>16<i>26]"
175 + "[(27-38)s:SteinGarten|i:steingarten|_4$<i>27<i>38]"
176 + "[(39-44)s:franZ|i:franz|_5$<i>39<i>44]"
177 + "[(45-49)s:HaNs|i:hans|_6$<i>45<i>49]"
178 + "[(50-54)s:Haus|i:haus|_7$<i>50<i>54]"
179 + "[(55-59)s:Efeu|i:efeu|_8$<i>55<i>59]"
180 + "[(60-64)s:effe|i:effe|_9$<i>60<i>64]");
Nils Diewaldbb33da22015-03-04 16:24:25 +0000181 ki.addDoc(fd);
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000182
Nils Diewaldbb33da22015-03-04 16:24:25 +0000183 ki.commit();
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000184
Nils Diewaldbb33da22015-03-04 16:24:25 +0000185 QueryBuilder kq = new QueryBuilder("base");
186 SpanQuery sq = kq.re("s:Af*e", true).toQuery();
187 assertEquals("SpanMultiTermQueryWrapper(base:/i:af*e/)", sq.toString());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000188
Nils Diewaldbb33da22015-03-04 16:24:25 +0000189 Krill ks = new Krill(sq);
190 ks.getMeta().getContext().left.setToken(true).setLength(1);
191 ks.getMeta().getContext().right.setToken(true).setLength(1);
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000192
Nils Diewaldbb33da22015-03-04 16:24:25 +0000193 Result kr = ki.search(ks);
194 assertEquals((long) 2, kr.getTotalResults());
195 assertEquals("[AfFe] aFfFE ...", kr.getMatch(0).getSnippetBrackets());
196 assertEquals("AfFe [aFfFE] Baum ...", kr.getMatch(1)
197 .getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000198
Nils Diewaldbb33da22015-03-04 16:24:25 +0000199 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").re("s:Af.*e")
200 .toQuery()));
201 assertEquals((long) 1, kr.getTotalResults());
202 assertEquals("[AfFe] aFfFE ...", kr.getMatch(0).getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000203
Nils Diewaldbb33da22015-03-04 16:24:25 +0000204 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").re("s:baum.*",
205 true).toQuery()));
206 assertEquals((long) 2, kr.getTotalResults());
207 assertEquals("... aFfFE [Baum] Baumgarten ...", kr.getMatch(0)
208 .getSnippetBrackets());
209 assertEquals("... Baum [Baumgarten] SteinGarten ...", kr.getMatch(1)
210 .getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000211
Nils Diewaldbb33da22015-03-04 16:24:25 +0000212 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").re(
213 "s:.*garten", true).toQuery()));
214 assertEquals((long) 2, kr.getTotalResults());
215 assertEquals("... Baum [Baumgarten] SteinGarten ...", kr.getMatch(0)
216 .getSnippetBrackets());
217 assertEquals("... Baumgarten [SteinGarten] franZ ...", kr.getMatch(1)
218 .getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000219
Nils Diewaldbb33da22015-03-04 16:24:25 +0000220 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").re(
221 "s:.*garten", false).toQuery()));
222 assertEquals((long) 1, kr.getTotalResults());
223 assertEquals("... Baum [Baumgarten] SteinGarten ...", kr.getMatch(0)
224 .getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000225
Nils Diewaldbb33da22015-03-04 16:24:25 +0000226 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").re("s:ha.s",
227 true).toQuery()));
228 assertEquals((long) 2, kr.getTotalResults());
229 assertEquals("... franZ [HaNs] Haus ...", kr.getMatch(0)
230 .getSnippetBrackets());
231 assertEquals("... HaNs [Haus] Efeu ...", kr.getMatch(1)
232 .getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000233
Nils Diewaldbb33da22015-03-04 16:24:25 +0000234 kr = ki.search(ks.setSpanQuery(new QueryBuilder("base").re("s:.*f*e",
235 true).toQuery()));
236 assertEquals((long) 3, kr.getTotalResults());
237 assertEquals("[AfFe] aFfFE ...", kr.getMatch(0).getSnippetBrackets());
238 assertEquals("AfFe [aFfFE] Baum ...", kr.getMatch(1)
239 .getSnippetBrackets());
240 assertEquals("... Efeu [effe]", kr.getMatch(2).getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000241 };
242
Nils Diewaldbb33da22015-03-04 16:24:25 +0000243
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000244 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +0000245 public void indexRegexCombined () throws Exception {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000246 KrillIndex ki = new KrillIndex();
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000247
Nils Diewaldbb33da22015-03-04 16:24:25 +0000248 // abcabcabac
249 FieldDocument fd = new FieldDocument();
250 fd.addTV(
251 "base",
252 "affe afffe baum baumgarten steingarten franz hans haus efeu effe",
margaretha4f995582015-12-14 14:14:34 +0100253 "[(0-4)s:affe|_0$<i>0<i>4|-:t$<i>10]"
254 + "[(5-10)s:afffe|_1$<i>5<i>10]"
255 + "[(11-15)s:baum|_2$<i>11<i>15]"
256 + "[(16-26)s:baumgarten|_3$<i>16<i>26]"
257 + "[(27-38)s:steingarten|_4$<i>27<i>38]"
258 + "[(39-44)s:franz|_5$<i>39<i>44]"
259 + "[(45-49)s:hans|_6$<i>45<i>49]"
260 + "[(50-54)s:haus|_7$<i>50<i>54]"
261 + "[(55-59)s:efeu|_8$<i>55<i>59]"
262 + "[(60-64)s:effe|_9$<i>60<i>64]");
Nils Diewaldbb33da22015-03-04 16:24:25 +0000263 ki.addDoc(fd);
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000264
Nils Diewaldbb33da22015-03-04 16:24:25 +0000265 ki.commit();
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000266
Nils Diewaldbb33da22015-03-04 16:24:25 +0000267 QueryBuilder kq = new QueryBuilder("base");
268 SpanQuery sq = kq.seq(kq.seg("s:affe")).append(kq.re("s:af*e"))
269 .toQuery();
270 assertEquals(
271 "spanNext(base:s:affe, SpanMultiTermQueryWrapper(base:/s:af*e/))",
272 sq.toString());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000273
Nils Diewaldbb33da22015-03-04 16:24:25 +0000274 Krill ks = new Krill(sq);
275 ks.getMeta().getContext().left.setToken(true).setLength(1);
276 ks.getMeta().getContext().right.setToken(true).setLength(1);
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000277
Nils Diewaldbb33da22015-03-04 16:24:25 +0000278 Result kr = ki.search(ks);
279 assertEquals((long) 1, kr.getTotalResults());
280 assertEquals("[affe afffe] baum ...", kr.getMatch(0)
281 .getSnippetBrackets());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000282 };
Nils Diewaldb3a09db2013-12-21 00:22:02 +0000283
Nils Diewaldea125202014-09-19 15:12:06 +0000284
285 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +0000286 public void indexRegexWithinRewrite () throws Exception {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000287 KrillIndex ki = new KrillIndex();
Nils Diewaldea125202014-09-19 15:12:06 +0000288
Nils Diewaldbb33da22015-03-04 16:24:25 +0000289 // abcabcabac
290 FieldDocument fd = new FieldDocument();
291 fd.addTV(
292 "base",
293 "affe afffe baum baumgarten steingarten franz hans haus efeu effe",
margaretha4f995582015-12-14 14:14:34 +0100294 "[(0-4)s:affe|_0$<i>0<i>4|-:t$<i>10]"
295 + "[(5-10)s:afffe|_1$<i>5<i>10]"
296 + "[(11-15)s:baum|_2$<i>11<i>15]"
297 + "[(16-26)s:baumgarten|_3$<i>16<i>26]"
298 + "[(27-38)s:steingarten|_4$<i>27<i>38]"
299 + "[(39-44)s:franz|_5$<i>39<i>44]"
300 + "[(45-49)s:hans|_6$<i>45<i>49]"
301 + "[(50-54)s:haus|_7$<i>50<i>54]"
302 + "[(55-59)s:efeu|_8$<i>55<i>59]"
303 + "[(60-64)s:effe|_9$<i>60<i>64]");
Nils Diewaldbb33da22015-03-04 16:24:25 +0000304 ki.addDoc(fd);
Nils Diewaldea125202014-09-19 15:12:06 +0000305
Nils Diewaldbb33da22015-03-04 16:24:25 +0000306 ki.commit();
Nils Diewaldea125202014-09-19 15:12:06 +0000307
Nils Diewaldbb33da22015-03-04 16:24:25 +0000308 QueryBuilder kq = new QueryBuilder("base");
309 SpanQuery sq = kq.contains(
310 kq.seq(kq.re("s:a.*e")).append(kq.re("s:af*e")),
311 kq.seg("s:affe")).toQuery();
312 assertEquals(
313 "spanContain(spanNext(SpanMultiTermQueryWrapper(base:/s:a.*e/), SpanMultiTermQueryWrapper(base:/s:af*e/)), base:s:affe)",
314 sq.toString());
315 Krill ks = new Krill(sq);
316 ks.getMeta().getContext().left.setToken(true).setLength(1);
317 ks.getMeta().getContext().right.setToken(true).setLength(1);
Nils Diewaldea125202014-09-19 15:12:06 +0000318
Nils Diewaldbb33da22015-03-04 16:24:25 +0000319 Result kr = ki.search(ks);
320 assertEquals((long) 1, kr.getTotalResults());
321 assertEquals("[affe afffe] baum ...", kr.getMatch(0)
322 .getSnippetBrackets());
Nils Diewaldea125202014-09-19 15:12:06 +0000323 };
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000324};