blob: ef34315fdc87096053ee0aaf15235b780579c592 [file] [log] [blame]
Eliza Margaretha01929182014-02-19 11:48:59 +00001package de.ids_mannheim.korap.index;
2
Nils Diewaldb0dd9552013-12-20 02:28:34 +00003import java.util.*;
4import java.io.*;
5
6import org.apache.lucene.util.Version;
7import org.apache.lucene.util.BytesRef;
8import org.apache.lucene.util.Bits;
9
10import static org.junit.Assert.*;
11import org.junit.Test;
12import org.junit.Ignore;
13import org.junit.runner.RunWith;
14import org.junit.runners.JUnit4;
15
16import de.ids_mannheim.korap.KorapIndex;
17import de.ids_mannheim.korap.KorapQuery;
18import de.ids_mannheim.korap.KorapResult;
19import de.ids_mannheim.korap.KorapSearch;
20import de.ids_mannheim.korap.index.FieldDocument;
Nils Diewald85f9c422015-02-06 21:09:16 +000021import de.ids_mannheim.korap.model.MultiTermTokenStream;
Nils Diewaldb0dd9552013-12-20 02:28:34 +000022import org.apache.lucene.search.Query;
23import org.apache.lucene.search.spans.SpanQuery;
24import org.apache.lucene.search.spans.SpanTermQuery;
25
26@RunWith(JUnit4.class)
27public class TestRegexWildcardIndex {
28
29 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +000030 public void indexRegex () throws Exception {
Nils Diewaldb0dd9552013-12-20 02:28:34 +000031 KorapIndex ki = new KorapIndex();
32
33 // abcabcabac
34 FieldDocument fd = new FieldDocument();
35 fd.addTV("base",
36 "affe afffe baum baumgarten steingarten franz hans haus efeu effe",
37 "[(0-4)s:affe|_0#0-4|-:t$<i>10]" +
38 "[(5-10)s:afffe|_1#5-10]" +
39 "[(11-15)s:baum|_2#11-15]" +
40 "[(16-26)s:baumgarten|_3#16-26]" +
41 "[(27-38)s:steingarten|_4#27-38]" +
42 "[(39-44)s:franz|_5#39-44]" +
43 "[(45-49)s:hans|_6#45-49]" +
44 "[(50-54)s:haus|_7#50-54]" +
45 "[(55-59)s:efeu|_8#55-59]" +
46 "[(60-64)s:effe|_9#60-64]");
47 ki.addDoc(fd);
48
49 ki.commit();
50
51 KorapQuery kq = new KorapQuery("base");
52 SpanQuery sq = kq.re("s:af*e").toQuery();
53 assertEquals("SpanMultiTermQueryWrapper(base:/s:af*e/)", sq.toString());
Eliza Margaretha01929182014-02-19 11:48:59 +000054
Nils Diewaldb0dd9552013-12-20 02:28:34 +000055 KorapSearch ks = new KorapSearch(sq);
Nils Diewald1e5d5942014-05-20 13:29:53 +000056 ks.context.left.setToken(true).setLength(1);
57 ks.context.right.setToken(true).setLength(1);
Nils Diewaldb0dd9552013-12-20 02:28:34 +000058
59 KorapResult kr = ki.search(ks);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +000060 assertEquals((long) 2, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +000061 assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
62 assertEquals("affe [afffe] baum ...", kr.getMatch(1).getSnippetBrackets());
63
64 kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:baum.*").toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +000065 assertEquals((long) 2, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +000066 assertEquals("... afffe [baum] baumgarten ...", kr.getMatch(0).getSnippetBrackets());
67 assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(1).getSnippetBrackets());
68
69 kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.....?garten").toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +000070 assertEquals((long) 2, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +000071 assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(0).getSnippetBrackets());
72 assertEquals("... baumgarten [steingarten] franz ...", kr.getMatch(1).getSnippetBrackets());
73
74 kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:ha.s").toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +000075 assertEquals((long) 2, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +000076 assertEquals("... franz [hans] haus ...", kr.getMatch(0).getSnippetBrackets());
77 assertEquals("... hans [haus] efeu ...", kr.getMatch(1).getSnippetBrackets());
78
79 kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.*ff.*").toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +000080 assertEquals((long) 3, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +000081 assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
82 assertEquals("affe [afffe] baum ...", kr.getMatch(1).getSnippetBrackets());
83 assertEquals("... efeu [effe]", kr.getMatch(2).getSnippetBrackets());
84 };
85
86 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +000087 public void indexWildcard () throws Exception {
Nils Diewaldb0dd9552013-12-20 02:28:34 +000088 KorapIndex ki = new KorapIndex();
89
90 // abcabcabac
91 FieldDocument fd = new FieldDocument();
92 fd.addTV("base",
93 "affe afffe baum baumgarten steingarten franz hans haus efeu effe",
94 "[(0-4)s:affe|_0#0-4|-:t$<i>10]" +
95 "[(5-10)s:afffe|_1#5-10]" +
96 "[(11-15)s:baum|_2#11-15]" +
97 "[(16-26)s:baumgarten|_3#16-26]" +
98 "[(27-38)s:steingarten|_4#27-38]" +
99 "[(39-44)s:franz|_5#39-44]" +
100 "[(45-49)s:hans|_6#45-49]" +
101 "[(50-54)s:haus|_7#50-54]" +
102 "[(55-59)s:efeu|_8#55-59]" +
103 "[(60-64)s:effe|_9#60-64]");
104 ki.addDoc(fd);
105
106 ki.commit();
107
108 KorapQuery kq = new KorapQuery("base");
109 SpanQuery sq = kq.wc("s:af*e").toQuery();
110 assertEquals("SpanMultiTermQueryWrapper(base:s:af*e)", sq.toString());
111
112 KorapSearch ks = new KorapSearch(sq);
Nils Diewald1e5d5942014-05-20 13:29:53 +0000113 ks.context.left.setToken(true).setLength(1);
114 ks.context.right.setToken(true).setLength(1);
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000115
116 KorapResult kr = ki.search(ks);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000117 assertEquals((long) 2, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000118 assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
119 assertEquals("affe [afffe] baum ...", kr.getMatch(1).getSnippetBrackets());
120
121 kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:baum.*").toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000122 assertEquals((long) 0, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000123
124 kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:baum*").toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000125 assertEquals((long) 2, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000126 assertEquals("... afffe [baum] baumgarten ...", kr.getMatch(0).getSnippetBrackets());
127 assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(1).getSnippetBrackets());
128
129 kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:*garten").toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000130 assertEquals((long) 2, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000131 assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(0).getSnippetBrackets());
132 assertEquals("... baumgarten [steingarten] franz ...", kr.getMatch(1).getSnippetBrackets());
133
134 kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:ha?s").toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000135 assertEquals((long) 2, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000136 assertEquals("... franz [hans] haus ...", kr.getMatch(0).getSnippetBrackets());
137 assertEquals("... hans [haus] efeu ...", kr.getMatch(1).getSnippetBrackets());
138
139 kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:?ff?").toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000140 assertEquals((long) 2, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000141 assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
142 assertEquals("... efeu [effe]", kr.getMatch(1).getSnippetBrackets());
143 };
144
145 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +0000146 public void indexRegexCaseInsensitive () throws Exception {
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000147 KorapIndex ki = new KorapIndex();
148
149 // abcabcabac
150 FieldDocument fd = new FieldDocument();
151 fd.addTV("base",
152 "AfFe aFfFE Baum Baumgarten SteinGarten franZ HaNs Haus Efeu effe",
153 "[(0-4)s:AfFe|i:affe|_0#0-4|-:t$<i>10]" +
154 "[(5-10)s:aFfFE|i:afffe|_1#5-10]" +
155 "[(11-15)s:Baum|i:baum|_2#11-15]" +
156 "[(16-26)s:Baumgarten|i:baumgarten|_3#16-26]" +
157 "[(27-38)s:SteinGarten|i:steingarten|_4#27-38]" +
158 "[(39-44)s:franZ|i:franz|_5#39-44]" +
159 "[(45-49)s:HaNs|i:hans|_6#45-49]" +
160 "[(50-54)s:Haus|i:haus|_7#50-54]" +
161 "[(55-59)s:Efeu|i:efeu|_8#55-59]" +
162 "[(60-64)s:effe|i:effe|_9#60-64]");
163 ki.addDoc(fd);
164
165 ki.commit();
166
167 KorapQuery kq = new KorapQuery("base");
168 SpanQuery sq = kq.re("s:Af*e", true).toQuery();
169 assertEquals("SpanMultiTermQueryWrapper(base:/i:af*e/)", sq.toString());
170
171 KorapSearch ks = new KorapSearch(sq);
Nils Diewald1e5d5942014-05-20 13:29:53 +0000172 ks.context.left.setToken(true).setLength(1);
173 ks.context.right.setToken(true).setLength(1);
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000174
175 KorapResult kr = ki.search(ks);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000176 assertEquals((long) 2, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000177 assertEquals("[AfFe] aFfFE ...", kr.getMatch(0).getSnippetBrackets());
178 assertEquals("AfFe [aFfFE] Baum ...", kr.getMatch(1).getSnippetBrackets());
179
180 kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:Af.*e").toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000181 assertEquals((long) 1, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000182 assertEquals("[AfFe] aFfFE ...", kr.getMatch(0).getSnippetBrackets());
183
184 kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:baum.*", true).toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000185 assertEquals((long) 2, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000186 assertEquals("... aFfFE [Baum] Baumgarten ...", kr.getMatch(0).getSnippetBrackets());
187 assertEquals("... Baum [Baumgarten] SteinGarten ...", kr.getMatch(1).getSnippetBrackets());
188
189 kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.*garten", true).toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000190 assertEquals((long) 2, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000191 assertEquals("... Baum [Baumgarten] SteinGarten ...", kr.getMatch(0).getSnippetBrackets());
192 assertEquals("... Baumgarten [SteinGarten] franZ ...", kr.getMatch(1).getSnippetBrackets());
193
194 kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.*garten", false).toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000195 assertEquals((long) 1, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000196 assertEquals("... Baum [Baumgarten] SteinGarten ...", kr.getMatch(0).getSnippetBrackets());
197
198 kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:ha.s", true).toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000199 assertEquals((long) 2, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000200 assertEquals("... franZ [HaNs] Haus ...", kr.getMatch(0).getSnippetBrackets());
201 assertEquals("... HaNs [Haus] Efeu ...", kr.getMatch(1).getSnippetBrackets());
202
203 kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.*f*e", true).toQuery()));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000204 assertEquals((long) 3, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000205 assertEquals("[AfFe] aFfFE ...", kr.getMatch(0).getSnippetBrackets());
206 assertEquals("AfFe [aFfFE] Baum ...", kr.getMatch(1).getSnippetBrackets());
207 assertEquals("... Efeu [effe]", kr.getMatch(2).getSnippetBrackets());
208 };
209
210 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +0000211 public void indexRegexCombined () throws Exception {
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000212 KorapIndex ki = new KorapIndex();
213
214 // abcabcabac
215 FieldDocument fd = new FieldDocument();
216 fd.addTV("base",
217 "affe afffe baum baumgarten steingarten franz hans haus efeu effe",
218 "[(0-4)s:affe|_0#0-4|-:t$<i>10]" +
219 "[(5-10)s:afffe|_1#5-10]" +
220 "[(11-15)s:baum|_2#11-15]" +
221 "[(16-26)s:baumgarten|_3#16-26]" +
222 "[(27-38)s:steingarten|_4#27-38]" +
223 "[(39-44)s:franz|_5#39-44]" +
224 "[(45-49)s:hans|_6#45-49]" +
225 "[(50-54)s:haus|_7#50-54]" +
226 "[(55-59)s:efeu|_8#55-59]" +
227 "[(60-64)s:effe|_9#60-64]");
228 ki.addDoc(fd);
229
230 ki.commit();
231
232 KorapQuery kq = new KorapQuery("base");
233 SpanQuery sq = kq.seq(kq.seg("s:affe")).append(kq.re("s:af*e")).toQuery();
234 assertEquals("spanNext(base:s:affe, SpanMultiTermQueryWrapper(base:/s:af*e/))", sq.toString());
235
236 KorapSearch ks = new KorapSearch(sq);
Nils Diewald1e5d5942014-05-20 13:29:53 +0000237 ks.context.left.setToken(true).setLength(1);
238 ks.context.right.setToken(true).setLength(1);
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000239
240 KorapResult kr = ki.search(ks);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000241 assertEquals((long) 1, kr.getTotalResults());
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000242 assertEquals("[affe afffe] baum ...", kr.getMatch(0).getSnippetBrackets());
243 };
Nils Diewaldb3a09db2013-12-21 00:22:02 +0000244
Nils Diewaldea125202014-09-19 15:12:06 +0000245
246 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +0000247 public void indexRegexWithinRewrite () throws Exception {
Nils Diewaldea125202014-09-19 15:12:06 +0000248 KorapIndex ki = new KorapIndex();
249
250 // abcabcabac
251 FieldDocument fd = new FieldDocument();
252 fd.addTV("base",
253 "affe afffe baum baumgarten steingarten franz hans haus efeu effe",
254 "[(0-4)s:affe|_0#0-4|-:t$<i>10]" +
255 "[(5-10)s:afffe|_1#5-10]" +
256 "[(11-15)s:baum|_2#11-15]" +
257 "[(16-26)s:baumgarten|_3#16-26]" +
258 "[(27-38)s:steingarten|_4#27-38]" +
259 "[(39-44)s:franz|_5#39-44]" +
260 "[(45-49)s:hans|_6#45-49]" +
261 "[(50-54)s:haus|_7#50-54]" +
262 "[(55-59)s:efeu|_8#55-59]" +
263 "[(60-64)s:effe|_9#60-64]");
264 ki.addDoc(fd);
265
266 ki.commit();
267
268 KorapQuery kq = new KorapQuery("base");
269 SpanQuery sq = kq.contains(
270 kq.seq(
271 kq.re("s:a.*e")
272 ).append(
273 kq.re("s:af*e")
274 ),
275 kq.seg("s:affe")).toQuery();
276 assertEquals("spanContain(spanNext(SpanMultiTermQueryWrapper(base:/s:a.*e/), SpanMultiTermQueryWrapper(base:/s:af*e/)), base:s:affe)", sq.toString());
277 KorapSearch ks = new KorapSearch(sq);
278 ks.context.left.setToken(true).setLength(1);
279 ks.context.right.setToken(true).setLength(1);
280
281 KorapResult kr = ki.search(ks);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000282 assertEquals((long) 1, kr.getTotalResults());
Nils Diewaldea125202014-09-19 15:12:06 +0000283 assertEquals("[affe afffe] baum ...", kr.getMatch(0).getSnippetBrackets());
284 };
Nils Diewaldb0dd9552013-12-20 02:28:34 +0000285};