| margaretha | c35e8a0 | 2017-09-11 16:34:20 +0200 | [diff] [blame] | 1 | package de.ids_mannheim.korap.index; |
| 2 | |
| 3 | import static org.junit.Assert.assertEquals; |
| 4 | |
| 5 | import java.io.IOException; |
| 6 | import java.util.ArrayList; |
| 7 | |
| 8 | import org.apache.lucene.index.Term; |
| 9 | import org.apache.lucene.search.WildcardQuery; |
| 10 | import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; |
| 11 | import org.apache.lucene.search.spans.SpanNearQuery; |
| 12 | import org.apache.lucene.search.spans.SpanQuery; |
| 13 | import org.apache.lucene.search.spans.SpanTermQuery; |
| 14 | import org.junit.Test; |
| 15 | |
| 16 | import de.ids_mannheim.korap.KrillIndex; |
| 17 | import de.ids_mannheim.korap.query.DistanceConstraint; |
| 18 | import de.ids_mannheim.korap.query.SpanClassQuery; |
| 19 | import de.ids_mannheim.korap.query.SpanMultipleDistanceQuery; |
| 20 | import de.ids_mannheim.korap.response.Result; |
| 21 | |
| 22 | public class TestWildcardIndex { |
| 23 | |
| 24 | private SpanTermQuery sq; |
| 25 | private KrillIndex ki; |
| 26 | private Result kr; |
| 27 | private ArrayList<DistanceConstraint> constraints; |
| 28 | |
| 29 | |
| 30 | public TestWildcardIndex () { |
| 31 | // &Erfahrung |
| 32 | sq = new SpanTermQuery(new Term("tokens", "tt/l:Erfahrung")); |
| 33 | |
| 34 | // /+w1:2,s0 |
| 35 | constraints = new ArrayList<DistanceConstraint>(); |
| 36 | constraints.add(TestMultipleDistanceIndex.createConstraint("w", 1, 2, |
| 37 | true, false)); |
| 38 | constraints.add(TestMultipleDistanceIndex.createConstraint("tokens", |
| 39 | "base/s:s", 0, 0, true, false)); |
| 40 | } |
| 41 | |
| 42 | |
| 43 | private FieldDocument createFieldDoc1 () { |
| 44 | FieldDocument fd = new FieldDocument(); |
| 45 | fd.addString("ID", "doc-1"); |
| 46 | fd.addTV("tokens", "text", |
| 47 | "[(0-1)s:meine|_1$<i>0<i>1|<>:base/s:s$<b>64<i>0<i>9<i>10<b>0]" |
| 48 | + "[(1-2)tt/l:Erfahrung|_2$<i>1<i>2]" |
| 49 | + "[(2-3)s:meiner|_3$<i>2<i>3]" |
| 50 | + "[(3-4)tt/l:Erfahrung|_4$<i>3<i>4]" |
| 51 | + "[(4-5)s:mein|_5$<i>4<i>5]" |
| 52 | + "[(5-6)tt/l:Erfahrung|_6$<i>5<i>6]" |
| 53 | + "[(6-7)s:meinem|_7$<i>6<i>7]" |
| 54 | + "[(7-8)tt/l:Erfahrung|_8$<i>7<i>8]" |
| 55 | + "[(8-9)s:meinen|_9$<i>8<i>9]" |
| 56 | + "[(9-10)tt/l:Erfahrung|_10$<i>9<i>10]"); |
| 57 | return fd; |
| 58 | } |
| 59 | |
| 60 | |
| 61 | @Test |
| 62 | public void testWildcardStarWithCollection () throws IOException { |
| 63 | ki = new KrillIndex(); |
| 64 | ki.addDoc(createFieldDoc1()); |
| 65 | ki.commit(); |
| 66 | // meine* |
| 67 | WildcardQuery wcquery = |
| 68 | new WildcardQuery(new Term("tokens", "s:meine*")); |
| 69 | SpanMultiTermQueryWrapper<WildcardQuery> mtq = |
| 70 | new SpanMultiTermQueryWrapper<WildcardQuery>(wcquery); |
| 71 | |
| 72 | // meine* /+w1:2,s0 &Erfahrung |
| 73 | SpanQuery mdsq = new SpanMultipleDistanceQuery( |
| 74 | new SpanClassQuery(mtq, (byte) 129), |
| 75 | new SpanClassQuery(sq, (byte) 129), constraints, true, true); |
| 76 | |
| 77 | kr = ki.search(mdsq, (short) 10); |
| 78 | assertEquals(4, kr.getMatches().size()); |
| 79 | } |
| 80 | |
| 81 | |
| 82 | @Test |
| 83 | public void testWildcardQuestionMark1 () throws IOException { |
| 84 | ki = new KrillIndex(); |
| 85 | ki.addDoc(createFieldDoc1()); |
| 86 | ki.commit(); |
| 87 | |
| 88 | // Wildcard ? means regex . (expects exactly one character) |
| 89 | SpanMultiTermQueryWrapper<WildcardQuery> mtq = |
| 90 | new SpanMultiTermQueryWrapper<WildcardQuery>( |
| 91 | new WildcardQuery(new Term("tokens", "s:meine?"))); |
| 92 | SpanMultipleDistanceQuery mdsq = new SpanMultipleDistanceQuery( |
| 93 | new SpanClassQuery(mtq, (byte) 129), |
| 94 | new SpanClassQuery(sq, (byte) 129), constraints, true, true); |
| 95 | |
| 96 | kr = ki.search(mdsq, (short) 10); |
| 97 | assertEquals(3, kr.getMatches().size()); |
| 98 | |
| 99 | } |
| 100 | |
| 101 | |
| 102 | @Test |
| 103 | public void testWildcardQuestionMark2 () throws IOException { |
| 104 | ki = new KrillIndex(); |
| 105 | ki.addDoc(createFieldDoc1()); |
| 106 | ki.commit(); |
| 107 | |
| 108 | // Wildcard ? means regex . (expects exactly one character) |
| 109 | SpanMultiTermQueryWrapper<WildcardQuery> mtq = |
| 110 | new SpanMultiTermQueryWrapper<WildcardQuery>( |
| 111 | new WildcardQuery(new Term("tokens", "s:mein?"))); |
| 112 | SpanMultipleDistanceQuery mdsq = new SpanMultipleDistanceQuery( |
| 113 | new SpanClassQuery(mtq, (byte) 129), |
| 114 | new SpanClassQuery(sq, (byte) 129), constraints, true, true); |
| 115 | |
| 116 | kr = ki.search(mdsq, (short) 10); |
| 117 | assertEquals(1, kr.getMatches().size()); |
| 118 | |
| 119 | } |
| 120 | |
| 121 | |
| 122 | @Test |
| 123 | public void testWildcardPlusWithCollection () throws IOException { |
| 124 | ki = new KrillIndex(); |
| 125 | ki.addDoc(createFieldDoc1()); |
| 126 | ki.commit(); |
| 127 | // mein+ /+w1:2,s0 &Erfahrung |
| 128 | SpanMultiTermQueryWrapper<WildcardQuery> mtq = |
| 129 | new SpanMultiTermQueryWrapper<WildcardQuery>( |
| 130 | new WildcardQuery(new Term("tokens", "s:mein+"))); |
| 131 | |
| 132 | |
| 133 | // Just to make sure, Lucene internal queries treat SpanOr([]) correctly |
| 134 | SpanQuery soq = new SpanNearQuery(new SpanQuery[] { mtq, sq }, 1, true); |
| 135 | kr = ki.search(soq, (short) 10); |
| 136 | // As described in http://korap.github.io/Koral/, '+' is not a valid wildcard |
| 137 | assertEquals(0, kr.getMatches().size()); |
| 138 | |
| 139 | |
| 140 | |
| 141 | // Check the reported classed query |
| 142 | SpanMultipleDistanceQuery mdsq = new SpanMultipleDistanceQuery( |
| 143 | new SpanClassQuery(mtq, (byte) 129), |
| 144 | new SpanClassQuery(sq, (byte) 129), constraints, true, true); |
| 145 | |
| 146 | kr = ki.search(mdsq, (short) 10); |
| 147 | assertEquals(0, kr.getMatches().size()); |
| 148 | |
| 149 | |
| 150 | // Check multiple distance query |
| 151 | mdsq = new SpanMultipleDistanceQuery(mtq, sq, constraints, true, true); |
| 152 | |
| 153 | kr = ki.search(mdsq, (short) 10); |
| 154 | assertEquals(0, kr.getMatches().size()); |
| 155 | } |
| 156 | } |