| Eliza Margaretha | d28469f | 2014-03-10 12:42:21 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap.index; |
| 2 | |
| 3 | import static org.junit.Assert.assertEquals; |
| 4 | |
| 5 | import java.io.File; |
| 6 | import java.io.IOException; |
| 7 | import java.io.InputStream; |
| 8 | import java.util.Properties; |
| 9 | |
| 10 | import org.apache.lucene.index.Term; |
| 11 | import org.apache.lucene.search.spans.SpanQuery; |
| 12 | import org.apache.lucene.search.spans.SpanTermQuery; |
| 13 | import org.apache.lucene.store.MMapDirectory; |
| 14 | import org.junit.Test; |
| 15 | |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 16 | import de.ids_mannheim.korap.KrillCollection; |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 17 | import de.ids_mannheim.korap.KrillIndex; |
| Nils Diewald | 392bcf3 | 2015-02-26 20:01:17 +0000 | [diff] [blame] | 18 | import de.ids_mannheim.korap.response.Match; |
| Nils Diewald | 884dbcf | 2015-02-27 17:02:28 +0000 | [diff] [blame] | 19 | import de.ids_mannheim.korap.response.Result; |
| Nils Diewald | bbd39a5 | 2015-02-23 19:56:57 +0000 | [diff] [blame] | 20 | import de.ids_mannheim.korap.Krill; |
| Nils Diewald | 01ff7af | 2015-02-04 22:54:26 +0000 | [diff] [blame] | 21 | import de.ids_mannheim.korap.collection.BooleanFilter; |
| Eliza Margaretha | d469346 | 2014-03-17 13:16:18 +0000 | [diff] [blame] | 22 | import de.ids_mannheim.korap.query.DistanceConstraint; |
| Eliza Margaretha | d28469f | 2014-03-10 12:42:21 +0000 | [diff] [blame] | 23 | import de.ids_mannheim.korap.query.SpanDistanceQuery; |
| 24 | import de.ids_mannheim.korap.query.SpanElementQuery; |
| 25 | import de.ids_mannheim.korap.query.SpanNextQuery; |
| Eliza Margaretha | d469346 | 2014-03-17 13:16:18 +0000 | [diff] [blame] | 26 | import de.ids_mannheim.korap.query.SpanRepetitionQuery; |
| Eliza Margaretha | d28469f | 2014-03-10 12:42:21 +0000 | [diff] [blame] | 27 | |
| 28 | public class TestWPDIndex { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 29 | long start, end; |
| 30 | KrillIndex ki; |
| 31 | Result kr; |
| 32 | Krill ks; |
| 33 | |
| 34 | |
| 35 | private SpanDistanceQuery createElementDistanceQuery (String e, String x, |
| 36 | String y, int min, int max, boolean isOrdered, boolean exclusion) { |
| 37 | SpanElementQuery eq = new SpanElementQuery("tokens", e); |
| 38 | SpanDistanceQuery sq = new SpanDistanceQuery(new SpanTermQuery( |
| 39 | new Term("tokens", x)), |
| 40 | new SpanTermQuery(new Term("tokens", y)), |
| 41 | new DistanceConstraint(eq, min, max, isOrdered, exclusion), |
| 42 | true); |
| 43 | return sq; |
| Eliza Margaretha | d28469f | 2014-03-10 12:42:21 +0000 | [diff] [blame] | 44 | } |
| Eliza Margaretha | d28469f | 2014-03-10 12:42:21 +0000 | [diff] [blame] | 45 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 46 | |
| 47 | private SpanDistanceQuery createDistanceQuery (String x, String y, int min, |
| 48 | int max, boolean isOrdered, boolean exclusion) { |
| 49 | SpanDistanceQuery sq = new SpanDistanceQuery(new SpanTermQuery( |
| 50 | new Term("tokens", x)), |
| 51 | new SpanTermQuery(new Term("tokens", y)), |
| 52 | new DistanceConstraint(min, max, isOrdered, exclusion), true); |
| 53 | return sq; |
| 54 | } |
| 55 | |
| 56 | |
| 57 | public TestWPDIndex () throws IOException { |
| 58 | InputStream is = getClass().getResourceAsStream("/korap.conf"); |
| 59 | Properties prop = new Properties(); |
| 60 | prop.load(is); |
| 61 | |
| 62 | String indexPath = prop.getProperty("lucene.indexDir"); |
| 63 | MMapDirectory md = new MMapDirectory(new File(indexPath)); |
| 64 | ki = new KrillIndex(md); |
| 65 | } |
| 66 | |
| 67 | |
| 68 | /** Token distance spans */ |
| 69 | @Test |
| 70 | public void testCase1 () throws IOException { |
| 71 | SpanDistanceQuery sq; |
| 72 | // ordered |
| 73 | sq = createDistanceQuery("s:Wir", "s:kommen", 1, 1, true, false); |
| 74 | ks = new Krill(sq); |
| 75 | kr = ks.apply(ki); |
| 76 | assertEquals(kr.getTotalResults(), 8); |
| 77 | |
| 78 | // unordered |
| 79 | sq = createDistanceQuery("s:Wir", "s:kommen", 1, 1, false, false); |
| 80 | ks = new Krill(sq); |
| 81 | kr = ks.apply(ki); |
| 82 | assertEquals(kr.getTotalResults(), 11); |
| 83 | |
| 84 | sq = createDistanceQuery("s:kommen", "s:Wir", 1, 1, false, false); |
| 85 | ks = new Krill(sq); |
| 86 | kr = ks.apply(ki); |
| 87 | assertEquals(kr.getTotalResults(), 11); |
| 88 | //System.out.println(kr.getTotalResults()); |
| 89 | //for (Match km : kr.getMatches()){ |
| 90 | //System.out.println(km.getDocID() +" "+km.getStartPos() +" "+ km.getEndPos()); |
| 91 | //System.out.println(km.getSnippetBrackets()); |
| 92 | //System.out.println(km.toJSON()); |
| Eliza Margaretha | 7788a98 | 2014-08-29 16:10:52 +0000 | [diff] [blame] | 93 | //} |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 94 | } |
| Eliza Margaretha | d28469f | 2014-03-10 12:42:21 +0000 | [diff] [blame] | 95 | |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 96 | |
| 97 | /** Token exclusion distance spans */ |
| 98 | @Test |
| 99 | public void testCase2 () throws IOException { |
| 100 | |
| 101 | SpanQuery q = new SpanTermQuery(new Term("tokens", "s:Wir")); |
| 102 | ks = new Krill(q); |
| 103 | kr = ks.apply(ki); |
| 104 | assertEquals(kr.getTotalResults(), 1907); |
| 105 | |
| 106 | SpanDistanceQuery sq; |
| 107 | // ordered |
| 108 | sq = createDistanceQuery("s:Wir", "s:kommen", 1, 1, true, true); |
| 109 | ks = new Krill(sq); |
| 110 | kr = ks.apply(ki); |
| 111 | assertEquals(kr.getTotalResults(), 1899); |
| 112 | |
| 113 | // unordered |
| 114 | sq = createDistanceQuery("s:Wir", "s:kommen", 1, 1, false, true); |
| 115 | ks = new Krill(sq); |
| 116 | kr = ks.apply(ki); |
| 117 | assertEquals(kr.getTotalResults(), 1896); |
| 118 | } |
| 119 | |
| 120 | |
| 121 | /** Element distance spans */ |
| 122 | @Test |
| 123 | public void testCase3 () throws IOException { |
| 124 | // ordered |
| 125 | SpanDistanceQuery sq = createElementDistanceQuery("s", "s:weg", |
| 126 | "s:fahren", 0, 1, true, false); |
| 127 | ks = new Krill(sq); |
| 128 | kr = ks.apply(ki); |
| 129 | assertEquals(kr.getTotalResults(), 3); |
| 130 | |
| 131 | // unordered |
| 132 | sq = createElementDistanceQuery("s", "s:weg", "s:fahren", 0, 1, false, |
| 133 | false); |
| 134 | ks = new Krill(sq); |
| 135 | kr = ks.apply(ki); |
| 136 | assertEquals(kr.getTotalResults(), 5); |
| 137 | |
| 138 | // only 0 |
| 139 | sq = createElementDistanceQuery("s", "s:weg", "s:fahren", 0, 0, false, |
| 140 | false); |
| 141 | kr = ki.search(sq, (short) 100); |
| 142 | assertEquals(kr.getTotalResults(), 2); |
| 143 | assertEquals("WPD_BBB.04463", kr.getMatch(0).getDocID()); |
| 144 | assertEquals(1094, kr.getMatch(0).getStartPos()); |
| 145 | assertEquals(1115, kr.getMatch(0).getEndPos()); |
| 146 | assertEquals("WPD_III.00758", kr.getMatch(1).getDocID()); |
| 147 | assertEquals(444, kr.getMatch(1).getStartPos()); |
| 148 | assertEquals(451, kr.getMatch(1).getEndPos()); |
| 149 | |
| 150 | // only 1 |
| 151 | sq = createElementDistanceQuery("s", "s:weg", "s:fahren", 1, 1, false, |
| 152 | false); |
| 153 | ks = new Krill(sq); |
| 154 | kr = ks.apply(ki); |
| 155 | assertEquals(kr.getTotalResults(), 3); |
| 156 | } |
| 157 | |
| 158 | |
| 159 | /** Element distance exclusion */ |
| 160 | @Test |
| 161 | public void testCase4 () throws IOException { |
| 162 | SpanDistanceQuery sq = createElementDistanceQuery("s", "s:weg", |
| 163 | "s:fahren", 1, 1, false, true); |
| 164 | ks = new Krill(sq); |
| 165 | kr = ks.apply(ki); |
| 166 | assertEquals(kr.getTotalResults(), 979); |
| 167 | //0.8s |
| 168 | |
| 169 | // Check if it includes some results |
| 170 | BooleanFilter bf = new BooleanFilter(); |
| 171 | bf.or("ID", "WPD_BBB.04463", "WPD_III.00758"); |
| 172 | KrillCollection kc = new KrillCollection(); |
| 173 | kc.filter(bf); |
| 174 | ks.setCollection(kc); |
| 175 | kr = ks.apply(ki); |
| 176 | assertEquals(1094, kr.getMatch(0).getStartPos()); |
| 177 | assertEquals(451, kr.getMatch(1).getEndPos()); |
| 178 | } |
| 179 | |
| 180 | |
| 181 | /** Repetition */ |
| 182 | @Test |
| 183 | public void testCase5 () throws IOException { |
| 184 | SpanQuery sq; |
| 185 | sq = new SpanRepetitionQuery(new SpanTermQuery(new Term("tokens", |
| 186 | "mate/p:ADJA")), 1, 2, true); |
| 187 | ks = new Krill(sq); |
| 188 | kr = ks.apply(ki); |
| 189 | assertEquals(kr.getTotalResults(), 4116416); |
| 190 | //0.9s |
| 191 | |
| 192 | sq = new SpanRepetitionQuery(new SpanTermQuery(new Term("tokens", |
| 193 | "mate/p:ADJA")), 1, 1, true); |
| 194 | ks = new Krill(sq); |
| 195 | kr = ks.apply(ki); |
| 196 | assertEquals(kr.getTotalResults(), 3879671); |
| 197 | |
| 198 | sq = new SpanRepetitionQuery(new SpanTermQuery(new Term("tokens", |
| 199 | "mate/p:ADJA")), 2, 2, true); |
| 200 | ks = new Krill(sq); |
| 201 | kr = ks.apply(ki); |
| 202 | assertEquals(kr.getTotalResults(), 236745); |
| 203 | //0.65s |
| 204 | } |
| 205 | |
| 206 | |
| 207 | /** Next and repetition */ |
| 208 | @Test |
| 209 | public void testCase6 () throws IOException { |
| 210 | SpanQuery sq = new SpanNextQuery(new SpanTermQuery(new Term("tokens", |
| 211 | "tt/p:NN")), new SpanRepetitionQuery(new SpanTermQuery( |
| 212 | new Term("tokens", "mate/p:ADJA")), 2, 2, true)); |
| 213 | ks = new Krill(sq); |
| 214 | kr = ks.apply(ki); |
| 215 | assertEquals(kr.getTotalResults(), 30223); |
| 216 | // 1.1s |
| 217 | |
| 218 | SpanQuery sq2 = new SpanNextQuery(sq, new SpanTermQuery(new Term( |
| 219 | "tokens", "tt/p:NN"))); |
| 220 | ks = new Krill(sq2); |
| 221 | kr = ks.apply(ki); |
| 222 | assertEquals(kr.getTotalResults(), 26607); |
| 223 | // 1.1s |
| 224 | } |
| Eliza Margaretha | d28469f | 2014-03-10 12:42:21 +0000 | [diff] [blame] | 225 | } |