| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 1 | import java.util.*; |
| 2 | import java.io.*; |
| 3 | |
| 4 | import org.apache.lucene.util.Version; |
| 5 | import org.apache.lucene.util.BytesRef; |
| 6 | import org.apache.lucene.util.Bits; |
| 7 | |
| 8 | import static org.junit.Assert.*; |
| 9 | import org.junit.Test; |
| 10 | import org.junit.Ignore; |
| 11 | import org.junit.runner.RunWith; |
| 12 | import org.junit.runners.JUnit4; |
| 13 | |
| 14 | import de.ids_mannheim.korap.KorapIndex; |
| 15 | import de.ids_mannheim.korap.KorapQuery; |
| 16 | import de.ids_mannheim.korap.KorapResult; |
| 17 | import de.ids_mannheim.korap.query.SpanNextQuery; |
| Nils Diewald | fdb94d8 | 2014-02-13 03:30:06 +0000 | [diff] [blame] | 18 | import de.ids_mannheim.korap.query.SpanMatchModifyClassQuery; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 19 | import de.ids_mannheim.korap.query.SpanClassQuery; |
| 20 | import de.ids_mannheim.korap.index.FieldDocument; |
| 21 | import de.ids_mannheim.korap.analysis.MultiTermTokenStream; |
| 22 | |
| 23 | import org.apache.lucene.search.spans.SpanQuery; |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 24 | import org.apache.lucene.search.spans.SpanOrQuery; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 25 | import org.apache.lucene.search.spans.SpanTermQuery; |
| 26 | import org.apache.lucene.index.Term; |
| 27 | |
// mvn -Dtest=TestMatchIndex#indexExample1 test
| 29 | |
| 30 | // match is shrink and split |
| 31 | |
| 32 | @RunWith(JUnit4.class) |
| 33 | public class TestMatchIndex { |
| 34 | |
| 35 | @Test |
| 36 | public void indexExample1 () throws IOException { |
| 37 | KorapIndex ki = new KorapIndex(); |
| 38 | |
| 39 | // abcabcabac |
| 40 | FieldDocument fd = new FieldDocument(); |
| 41 | fd.addTV("base", |
| 42 | "abcabcabac", |
| 43 | "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" + |
| 44 | "[(1-2)s:b|i:b|_1#1-2]" + |
| 45 | "[(2-3)s:c|i:c|_2#2-3]" + |
| 46 | "[(3-4)s:a|i:a|_3#3-4]" + |
| 47 | "[(4-5)s:b|i:b|_4#4-5]" + |
| 48 | "[(5-6)s:c|i:c|_5#5-6]" + |
| 49 | "[(6-7)s:a|i:a|_6#6-7]" + |
| 50 | "[(7-8)s:b|i:b|_7#7-8]" + |
| 51 | "[(8-9)s:a|i:a|_8#8-9]" + |
| 52 | "[(9-10)s:c|i:c|_9#9-10]"); |
| 53 | ki.addDoc(fd); |
| 54 | |
| 55 | ki.commit(); |
| 56 | |
| 57 | SpanQuery sq; |
| 58 | KorapResult kr; |
| 59 | |
| 60 | sq = new SpanNextQuery( |
| 61 | new SpanTermQuery(new Term("base", "s:b")), |
| 62 | new SpanClassQuery( |
| 63 | new SpanTermQuery(new Term("base", "s:a")) |
| 64 | ) |
| 65 | ); |
| 66 | kr = ki.search(sq, (short) 10); |
| 67 | |
| 68 | assertEquals("totalResults", 1, kr.totalResults()); |
| 69 | assertEquals("StartPos (0)", 7, kr.match(0).startPos); |
| 70 | assertEquals("EndPos (0)", 9, kr.match(0).endPos); |
| 71 | assertEquals("SnippetBrackets (0)", "... bcabca[b{a}]c", kr.match(0).snippetBrackets()); |
| 72 | |
| Nils Diewald | 3caa00d | 2013-12-13 02:24:04 +0000 | [diff] [blame] | 73 | assertEquals("Test no 'more' context", "<span class=\"context-left\"><span class=\"more\"></span>bcabca</span><span class=\"match\">b<em class=\"class-0 level-0\">a</em></span><span class=\"context-right\">c</span>", kr.match(0).snippetHTML()); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 74 | |
| Nils Diewald | fdb94d8 | 2014-02-13 03:30:06 +0000 | [diff] [blame] | 75 | sq = new SpanMatchModifyClassQuery( |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 76 | new SpanNextQuery( |
| 77 | new SpanTermQuery(new Term("base", "s:b")), |
| 78 | new SpanClassQuery( |
| 79 | new SpanTermQuery(new Term("base", "s:a")) |
| 80 | ) |
| 81 | ) |
| 82 | ); |
| 83 | kr = ki.search(sq, (short) 10); |
| 84 | |
| 85 | assertEquals("totalResults", 1, kr.totalResults()); |
| 86 | assertEquals("StartPos (0)", 8, kr.match(0).startPos); |
| 87 | assertEquals("EndPos (0)", 9, kr.match(0).endPos); |
| 88 | assertEquals("SnippetBrackets (0)", "... cabcab[a]c", kr.match(0).snippetBrackets()); |
| 89 | |
| Nils Diewald | fdb94d8 | 2014-02-13 03:30:06 +0000 | [diff] [blame] | 90 | sq = new SpanMatchModifyClassQuery( |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 91 | new SpanNextQuery( |
| 92 | new SpanClassQuery(new SpanTermQuery(new Term("base", "s:a")), (byte) 2), |
| 93 | new SpanClassQuery(new SpanTermQuery(new Term("base", "s:b")), (byte) 3) |
| 94 | ), (byte) 3 |
| 95 | ); |
| 96 | |
| 97 | kr = ki.search(sq, (short) 10); |
| 98 | |
| 99 | assertEquals("totalResults", 3, kr.totalResults()); |
| 100 | assertEquals("StartPos (0)", 1, kr.match(0).startPos); |
| 101 | assertEquals("EndPos (0)", 2, kr.match(0).endPos); |
| 102 | assertEquals("SnippetBrackets (0)", "a[b]cabcab ...", kr.match(0).snippetBrackets()); |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 103 | |
| Nils Diewald | 3caa00d | 2013-12-13 02:24:04 +0000 | [diff] [blame] | 104 | assertEquals("<span class=\"context-left\">a</span><span class=\"match\">b</span><span class=\"context-right\">cabcab<span class=\"more\"></span></span>", kr.match(0).snippetHTML()); |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 105 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 106 | assertEquals("StartPos (1)", 4, kr.match(1).startPos); |
| 107 | assertEquals("EndPos (1)", 5, kr.match(1).endPos); |
| 108 | assertEquals("SnippetBrackets (1)", "abca[b]cabac", kr.match(1).snippetBrackets()); |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 109 | |
| Nils Diewald | 3caa00d | 2013-12-13 02:24:04 +0000 | [diff] [blame] | 110 | assertEquals("<span class=\"context-left\">abca</span><span class=\"match\">b</span><span class=\"context-right\">cabac</span>", kr.match(1).snippetHTML()); |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 111 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 112 | assertEquals("StartPos (2)", 7, kr.match(2).startPos); |
| 113 | assertEquals("EndPos (2)", 8, kr.match(2).endPos); |
| 114 | assertEquals("SnippetBrackets (2)", "... bcabca[b]ac", kr.match(2).snippetBrackets()); |
| 115 | |
| 116 | |
| 117 | |
| 118 | // abcabcabac |
| Nils Diewald | fdb94d8 | 2014-02-13 03:30:06 +0000 | [diff] [blame] | 119 | sq = new SpanMatchModifyClassQuery( |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 120 | new SpanNextQuery( |
| 121 | new SpanTermQuery(new Term("base", "s:a")), |
| 122 | new SpanClassQuery( |
| 123 | new SpanNextQuery( |
| 124 | new SpanTermQuery(new Term("base", "s:b")), |
| 125 | new SpanClassQuery(new SpanTermQuery(new Term("base", "s:a"))) |
| 126 | ), (byte) 2 |
| 127 | )), (byte) 2); |
| 128 | |
| 129 | kr = ki.search(sq, (short) 10); |
| 130 | |
| 131 | // System.err.println(kr.toJSON()); |
| 132 | |
| 133 | assertEquals("totalResults", 1, kr.totalResults()); |
| 134 | assertEquals("SnippetBrackets (0)", "... bcabca[b{a}]c", kr.match(0).snippetBrackets()); |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 135 | |
| Nils Diewald | 3caa00d | 2013-12-13 02:24:04 +0000 | [diff] [blame] | 136 | assertEquals("SnippetHTML (0) 1", "<span class=\"context-left\"><span class=\"more\"></span>bcabca</span><span class=\"match\">b<em class=\"class-0 level-0\">a</em></span><span class=\"context-right\">c</span>", kr.match(0).snippetHTML()); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 137 | |
| 138 | // Offset tokens |
| 139 | kr = ki.search(sq, 0, (short) 10, true, (short) 2, true, (short) 2); |
| 140 | assertEquals("totalResults", 1, kr.totalResults()); |
| 141 | assertEquals("SnippetBrackets (0)", "... ca[b{a}]c", kr.match(0).snippetBrackets()); |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 142 | |
| 143 | |
| 144 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 145 | // Offset Characters |
| 146 | kr = ki.search(sq, 0, (short) 10, false, (short) 1, false, (short) 0); |
| 147 | assertEquals("totalResults", 1, kr.totalResults()); |
| 148 | assertEquals("SnippetBrackets (0)", "... a[b{a}] ...", kr.match(0).snippetBrackets()); |
| 149 | |
| Nils Diewald | 3caa00d | 2013-12-13 02:24:04 +0000 | [diff] [blame] | 150 | assertEquals("SnippetHTML (0) 2", "<span class=\"context-left\"><span class=\"more\"></span>a</span><span class=\"match\">b<em class=\"class-0 level-0\">a</em></span><span class=\"context-right\"><span class=\"more\"></span></span>", kr.match(0).snippetHTML()); |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 151 | |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 152 | // System.err.println(kr.toJSON()); |
| 153 | |
| Nils Diewald | fdb94d8 | 2014-02-13 03:30:06 +0000 | [diff] [blame] | 154 | sq = new SpanMatchModifyClassQuery( |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 155 | new SpanNextQuery( |
| 156 | new SpanClassQuery(new SpanTermQuery(new Term("base", "s:b")), (byte) 1), |
| 157 | new SpanClassQuery(new SpanTermQuery(new Term("base", "s:c")), (byte) 2) |
| 158 | ), (byte) 3 |
| 159 | ); |
| 160 | |
| 161 | kr = ki.search(sq, (short) 10); |
| 162 | |
| 163 | assertEquals("totalResults", 2, kr.totalResults()); |
| 164 | assertEquals("StartPos (0)", 1, kr.match(0).startPos); |
| 165 | assertEquals("EndPos (0)", 3, kr.match(0).endPos); |
| 166 | assertEquals("SnippetBrackets (0)", "a[{1:b}{2:c}]abcaba ...", kr.match(0).snippetBrackets()); |
| 167 | assertEquals("StartPos (1)", 4, kr.match(1).startPos); |
| 168 | assertEquals("EndPos (1)", 6, kr.match(1).endPos); |
| 169 | assertEquals("SnippetBrackets (1)", "abca[{1:b}{2:c}]abac", kr.match(1).snippetBrackets()); |
| 170 | |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 171 | assertEquals("Document count", 1, ki.numberOf("base", "documents")); |
| 172 | assertEquals("Token count", 10, ki.numberOf("base", "t")); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 173 | |
| 174 | |
| Nils Diewald | fdb94d8 | 2014-02-13 03:30:06 +0000 | [diff] [blame] | 175 | sq = new SpanMatchModifyClassQuery( |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 176 | new SpanNextQuery( |
| 177 | new SpanTermQuery(new Term("base", "s:a")), |
| 178 | new SpanClassQuery( |
| 179 | new SpanNextQuery( |
| 180 | new SpanTermQuery(new Term("base", "s:b")), |
| 181 | new SpanTermQuery(new Term("base", "s:c")) |
| 182 | ) |
| 183 | ) |
| 184 | ) |
| 185 | ); |
| 186 | |
| 187 | kr = ki.search(sq, (short) 2); |
| 188 | |
| 189 | assertEquals("totalResults", 2, kr.totalResults()); |
| 190 | assertEquals("StartPos (0)", 1, kr.match(0).startPos); |
| 191 | assertEquals("EndPos (0)", 3, kr.match(0).endPos); |
| 192 | assertEquals("SnippetBrackets (0)", "a[bc]abcaba ...", kr.match(0).snippetBrackets()); |
| 193 | assertEquals("StartPos (1)", 4, kr.match(1).startPos); |
| 194 | assertEquals("EndPos (1)", 6, kr.match(1).endPos); |
| 195 | assertEquals("SnippetBrackets (1)", "abca[bc]abac", kr.match(1).snippetBrackets()); |
| 196 | |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 197 | assertEquals(1, ki.numberOf("base", "documents")); |
| 198 | assertEquals(10, ki.numberOf("base", "t")); |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 199 | }; |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 200 | |
| 201 | |
| 202 | @Test |
| 203 | public void indexExample2 () throws IOException { |
| 204 | KorapIndex ki = new KorapIndex(); |
| 205 | |
| 206 | // abcabcabac |
| 207 | FieldDocument fd = new FieldDocument(); |
| 208 | fd.addTV("base", |
| 209 | "abcabcabac", |
| 210 | "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" + |
| 211 | "[(1-2)s:b|i:b|_1#1-2]" + |
| 212 | "[(2-3)s:c|i:c|_2#2-3]" + |
| 213 | "[(3-4)s:a|i:a|_3#3-4]" + |
| 214 | "[(4-5)s:b|i:b|_4#4-5]" + |
| 215 | "[(5-6)s:c|i:c|_5#5-6]" + |
| 216 | "[(6-7)s:a|i:a|_6#6-7]" + |
| 217 | "[(7-8)s:b|i:b|_7#7-8]" + |
| 218 | "[(8-9)s:a|i:a|_8#8-9]" + |
| 219 | "[(9-10)s:c|i:c|_9#9-10]"); |
| 220 | ki.addDoc(fd); |
| 221 | |
| 222 | ki.commit(); |
| 223 | |
| 224 | SpanQuery sq; |
| 225 | KorapResult kr; |
| 226 | |
| 227 | // No contexts: |
| 228 | sq = new SpanOrQuery( |
| 229 | new SpanTermQuery(new Term("base", "s:a")), |
| 230 | new SpanTermQuery(new Term("base", "s:c")) |
| 231 | ); |
| 232 | kr = ki.search(sq, (short) 20); |
| 233 | |
| 234 | assertEquals("totalResults", 7, kr.totalResults()); |
| Nils Diewald | 3caa00d | 2013-12-13 02:24:04 +0000 | [diff] [blame] | 235 | assertEquals("SnippetBrackets (0)", "<span class=\"context-left\"></span><span class=\"match\">a</span><span class=\"context-right\">bcabca<span class=\"more\"></span></span>", kr.match(0).snippetHTML()); |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 236 | assertEquals("SnippetBrackets (0)", "[a]bcabca ...", kr.match(0).snippetBrackets()); |
| 237 | |
| 238 | assertEquals("SnippetBrackets (1)", "ab[c]abcaba ...", kr.match(1).snippetBrackets()); |
| Nils Diewald | 3caa00d | 2013-12-13 02:24:04 +0000 | [diff] [blame] | 239 | assertEquals("SnippetBrackets (1)", "<span class=\"context-left\">ab</span><span class=\"match\">c</span><span class=\"context-right\">abcaba<span class=\"more\"></span></span>", kr.match(1).snippetHTML()); |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 240 | |
| 241 | assertEquals("SnippetBrackets (6)", "... abcaba[c]", kr.match(6).snippetBrackets()); |
| Nils Diewald | 3caa00d | 2013-12-13 02:24:04 +0000 | [diff] [blame] | 242 | assertEquals("SnippetBrackets (6)", "<span class=\"context-left\"><span class=\"more\"></span>abcaba</span><span class=\"match\">c</span><span class=\"context-right\"></span>", kr.match(6).snippetHTML()); |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 243 | |
| 244 | |
| 245 | kr = ki.search(sq, 0, (short) 20, true, (short) 0, true, (short) 0); |
| 246 | |
| 247 | assertEquals("totalResults", 7, kr.totalResults()); |
| 248 | assertEquals("SnippetBrackets (0)", "[a] ...", kr.match(0).snippetBrackets()); |
| Nils Diewald | 3caa00d | 2013-12-13 02:24:04 +0000 | [diff] [blame] | 249 | assertEquals("SnippetHTML (0)", "<span class=\"context-left\"></span><span class=\"match\">a</span><span class=\"context-right\"><span class=\"more\"></span></span>", kr.match(0).snippetHTML()); |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 250 | |
| 251 | assertEquals("SnippetBrackets (1)", "... [c] ...", kr.match(1).snippetBrackets()); |
| Nils Diewald | 3caa00d | 2013-12-13 02:24:04 +0000 | [diff] [blame] | 252 | assertEquals("SnippetHTML (1)", "<span class=\"context-left\"><span class=\"more\"></span></span><span class=\"match\">c</span><span class=\"context-right\"><span class=\"more\"></span></span>", kr.match(1).snippetHTML()); |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 253 | |
| 254 | assertEquals("SnippetBrackets (6)", "... [c]", kr.match(6).snippetBrackets()); |
| Nils Diewald | 3caa00d | 2013-12-13 02:24:04 +0000 | [diff] [blame] | 255 | assertEquals("SnippetBrackets (6)", "<span class=\"context-left\"><span class=\"more\"></span></span><span class=\"match\">c</span><span class=\"context-right\"></span>", kr.match(6).snippetHTML()); |
| Nils Diewald | f3b30ae | 2013-11-27 17:42:37 +0000 | [diff] [blame] | 256 | }; |
| Nils Diewald | 3ef9a47 | 2013-12-02 16:06:09 +0000 | [diff] [blame] | 257 | |
| 258 | |
| 259 | @Test |
| 260 | public void indexExample3 () throws IOException { |
| 261 | KorapIndex ki = new KorapIndex(); |
| 262 | |
| 263 | // abcabcabac |
| 264 | FieldDocument fd = new FieldDocument(); |
| 265 | fd.addTV("base", |
| 266 | "abcabcabac", |
| 267 | "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" + |
| 268 | "[(1-2)s:b|i:b|_1#1-2]" + |
| 269 | "[(2-3)s:c|i:c|_2#2-3]" + |
| 270 | "[(3-4)s:a|i:a|_3#3-4]" + |
| 271 | "[(4-5)s:b|i:b|_4#4-5]" + |
| 272 | "[(5-6)s:c|i:c|_5#5-6]" + |
| 273 | "[(6-7)s:a|i:a|_6#6-7]" + |
| 274 | "[(7-8)s:b|i:b|_7#7-8]" + |
| 275 | "[(8-9)s:a|i:a|_8#8-9]" + |
| 276 | "[(9-10)s:c|i:c|_9#9-10]"); |
| 277 | ki.addDoc(fd); |
| 278 | |
| 279 | ki.commit(); |
| 280 | |
| 281 | KorapResult kr; |
| 282 | |
| 283 | KorapQuery kq = new KorapQuery("base"); |
| 284 | |
| 285 | SpanQuery sq = kq._(1,kq.seq(kq.seg("s:b")).append(kq.seg("s:a")).append(kq._(2,kq.seg("s:c")))).toQuery(); |
| 286 | |
| 287 | kr = ki.search(sq, 0, (short) 20, true, (short) 2, true, (short) 5); |
| 288 | |
| 289 | assertEquals("totalResults", 1, kr.totalResults()); |
| 290 | assertEquals("SnippetBrackets (0)", "... ca[{1:ba{2:c}}]", kr.match(0).snippetBrackets()); |
| 291 | }; |
| Nils Diewald | f399a67 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 292 | }; |