| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 1 | package de.ids_mannheim.korap.collection; |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 2 | |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 3 | import java.io.IOException; |
| 4 | |
| 5 | import de.ids_mannheim.korap.KrillIndex; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 6 | import de.ids_mannheim.korap.KrillCollection; |
| 7 | import de.ids_mannheim.korap.collection.CollectionBuilder; |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 8 | import de.ids_mannheim.korap.index.FieldDocument; |
| Akron | 1d63f27 | 2015-07-28 12:19:49 +0200 | [diff] [blame] | 9 | import de.ids_mannheim.korap.index.TextAnalyzer; |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 10 | import de.ids_mannheim.korap.response.Result; |
| 11 | import de.ids_mannheim.korap.KrillQuery; |
| 12 | import de.ids_mannheim.korap.query.QueryBuilder; |
| 13 | |
| Akron | 1d63f27 | 2015-07-28 12:19:49 +0200 | [diff] [blame] | 14 | import org.apache.lucene.analysis.Analyzer; |
| 15 | import org.apache.lucene.analysis.TokenStream; |
| 16 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 17 | import org.apache.lucene.index.Term; |
| 18 | import org.apache.lucene.search.spans.SpanOrQuery; |
| 19 | import org.apache.lucene.search.spans.SpanQuery; |
| 20 | import org.apache.lucene.search.spans.SpanTermQuery; |
| 21 | import org.apache.lucene.search.spans.SpanQuery; |
| 22 | |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 23 | import static org.junit.Assert.*; |
| 24 | import org.junit.Test; |
| 25 | import org.junit.Ignore; |
| 26 | import org.junit.runner.RunWith; |
| 27 | import org.junit.runners.JUnit4; |
| 28 | |
| 29 | @RunWith(JUnit4.class) |
| 30 | public class TestKrillCollectionIndex { |
| 31 | private KrillIndex ki; |
| 32 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 33 | |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 34 | @Test |
| 35 | public void testIndexWithCollectionBuilder () throws IOException { |
| 36 | ki = new KrillIndex(); |
| 37 | ki.addDoc(createDoc1()); |
| 38 | ki.addDoc(createDoc2()); |
| 39 | ki.addDoc(createDoc3()); |
| 40 | ki.commit(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 41 | CollectionBuilder cb = new CollectionBuilder(); |
| 42 | KrillCollection kcn = new KrillCollection(ki); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 43 | |
| 44 | // Simple string tests |
| 45 | kcn.fromBuilder(cb.term("author", "Frank")); |
| 46 | assertEquals(1, kcn.docCount()); |
| 47 | |
| 48 | kcn.fromBuilder(cb.term("author", "Peter")); |
| 49 | assertEquals(1, kcn.docCount()); |
| 50 | |
| 51 | kcn.fromBuilder(cb.term("author", "Sebastian")); |
| 52 | assertEquals(1, kcn.docCount()); |
| 53 | |
| 54 | kcn.fromBuilder(cb.term("author", "Michael")); |
| 55 | assertEquals(0, kcn.docCount()); |
| 56 | |
| 57 | kcn.fromBuilder(cb.term("textClass", "reisen")); |
| 58 | assertEquals(3, kcn.docCount()); |
| 59 | |
| 60 | kcn.fromBuilder(cb.term("textClass", "kultur")); |
| 61 | assertEquals(2, kcn.docCount()); |
| 62 | |
| 63 | kcn.fromBuilder(cb.term("textClass", "finanzen")); |
| 64 | assertEquals(1, kcn.docCount()); |
| 65 | |
| 66 | // Simple orGroup tests |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 67 | kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Frank")) |
| 68 | .with(cb.term("author", "Michael"))); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 69 | assertEquals(1, kcn.docCount()); |
| 70 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 71 | kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Frank")) |
| 72 | .with(cb.term("author", "Sebastian"))); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 73 | assertEquals(2, kcn.docCount()); |
| 74 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 75 | kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Frank")) |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 76 | .with(cb.term("author", "Sebastian")) |
| 77 | .with(cb.term("author", "Peter"))); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 78 | assertEquals(3, kcn.docCount()); |
| 79 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 80 | kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Huhu")) |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 81 | .with(cb.term("author", "Haha")) |
| 82 | .with(cb.term("author", "Hehe"))); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 83 | assertEquals(0, kcn.docCount()); |
| 84 | |
| 85 | // Multi field orGroup tests |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 86 | kcn.fromBuilder(cb.orGroup().with(cb.term("ID", "doc-1")) |
| 87 | .with(cb.term("author", "Peter"))); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 88 | assertEquals(2, kcn.docCount()); |
| 89 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 90 | kcn.fromBuilder(cb.orGroup().with(cb.term("ID", "doc-1")) |
| 91 | .with(cb.term("author", "Frank"))); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 92 | assertEquals(1, kcn.docCount()); |
| 93 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 94 | kcn.fromBuilder(cb.orGroup().with(cb.term("ID", "doc-1")) |
| 95 | .with(cb.term("author", "Michael"))); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 96 | assertEquals(1, kcn.docCount()); |
| 97 | |
| 98 | // Simple andGroup tests |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 99 | kcn.fromBuilder(cb.andGroup().with(cb.term("author", "Frank")) |
| 100 | .with(cb.term("author", "Michael"))); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 101 | assertEquals(0, kcn.docCount()); |
| 102 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 103 | kcn.fromBuilder(cb.andGroup().with(cb.term("ID", "doc-1")) |
| 104 | .with(cb.term("author", "Frank"))); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 105 | assertEquals(1, kcn.docCount()); |
| 106 | |
| 107 | // andGroup in keyword field test |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 108 | kcn.fromBuilder(cb.andGroup().with(cb.term("textClass", "reisen")) |
| 109 | .with(cb.term("textClass", "finanzen"))); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 110 | assertEquals(1, kcn.docCount()); |
| 111 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 112 | kcn.fromBuilder(cb.andGroup().with(cb.term("textClass", "reisen")) |
| 113 | .with(cb.term("textClass", "kultur"))); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 114 | assertEquals(2, kcn.docCount()); |
| 115 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 116 | kcn.fromBuilder(cb.andGroup().with(cb.term("textClass", "finanzen")) |
| 117 | .with(cb.term("textClass", "kultur"))); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 118 | assertEquals(0, kcn.docCount()); |
| Akron | 80cba8d | 2015-07-27 17:27:46 +0200 | [diff] [blame] | 119 | |
| Akron | 1d63f27 | 2015-07-28 12:19:49 +0200 | [diff] [blame] | 120 | kcn.fromBuilder(cb.term("text", "mann")); |
| Akron | 80cba8d | 2015-07-27 17:27:46 +0200 | [diff] [blame] | 121 | assertEquals(3, kcn.docCount()); |
| 122 | |
| Akron | 1d63f27 | 2015-07-28 12:19:49 +0200 | [diff] [blame] | 123 | kcn.fromBuilder(cb.term("text", "frau")); |
| Akron | 80cba8d | 2015-07-27 17:27:46 +0200 | [diff] [blame] | 124 | assertEquals(1, kcn.docCount()); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 125 | }; |
| 126 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 127 | |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 128 | @Test |
| 129 | public void testIndexWithNegation () throws IOException { |
| 130 | ki = new KrillIndex(); |
| 131 | ki.addDoc(createDoc1()); |
| 132 | ki.addDoc(createDoc2()); |
| 133 | ki.addDoc(createDoc3()); |
| 134 | ki.commit(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 135 | CollectionBuilder cb = new CollectionBuilder(); |
| 136 | KrillCollection kcn = new KrillCollection(ki); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 137 | |
| 138 | // Simple negation tests |
| 139 | kcn.fromBuilder(cb.term("author", "Frank").not()); |
| 140 | assertEquals(2, kcn.docCount()); |
| 141 | |
| 142 | kcn.fromBuilder(cb.term("textClass", "reisen").not()); |
| 143 | assertEquals(0, kcn.docCount()); |
| 144 | |
| 145 | kcn.fromBuilder(cb.term("textClass", "kultur").not()); |
| 146 | assertEquals(1, kcn.docCount()); |
| 147 | |
| 148 | // orGroup with simple Negation |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 149 | kcn.fromBuilder(cb.orGroup().with(cb.term("textClass", "kultur").not()) |
| 150 | .with(cb.term("author", "Peter"))); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 151 | assertEquals(2, kcn.docCount()); |
| 152 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 153 | kcn.fromBuilder(cb.orGroup().with(cb.term("textClass", "kultur").not()) |
| 154 | .with(cb.term("author", "Sebastian"))); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 155 | assertEquals(1, kcn.docCount()); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 156 | }; |
| 157 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 158 | |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 159 | @Test |
| Akron | 80cba8d | 2015-07-27 17:27:46 +0200 | [diff] [blame] | 160 | public void testIndexWithMultipleCommitsAndDeletes () throws IOException { |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 161 | ki = new KrillIndex(); |
| 162 | ki.addDoc(createDoc1()); |
| 163 | ki.addDoc(createDoc2()); |
| 164 | ki.commit(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 165 | CollectionBuilder cb = new CollectionBuilder(); |
| 166 | KrillCollection kcn = new KrillCollection(ki); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 167 | |
| 168 | kcn.fromBuilder(cb.term("author", "Frank")); |
| 169 | assertEquals(1, kcn.docCount()); |
| 170 | kcn.fromBuilder(cb.term("author", "Peter")); |
| 171 | assertEquals(1, kcn.docCount()); |
| 172 | kcn.fromBuilder(cb.term("author", "Sebastian")); |
| 173 | assertEquals(0, kcn.docCount()); |
| 174 | kcn.fromBuilder(cb.term("author", "Michael").not()); |
| 175 | assertEquals(2, kcn.docCount()); |
| 176 | |
| 177 | // Add Sebastians doc |
| 178 | ki.addDoc(createDoc3()); |
| 179 | ki.commit(); |
| 180 | |
| 181 | kcn.fromBuilder(cb.term("author", "Frank")); |
| 182 | assertEquals(1, kcn.docCount()); |
| 183 | kcn.fromBuilder(cb.term("author", "Peter")); |
| 184 | assertEquals(1, kcn.docCount()); |
| 185 | kcn.fromBuilder(cb.term("author", "Sebastian")); |
| 186 | assertEquals(1, kcn.docCount()); |
| 187 | kcn.fromBuilder(cb.term("author", "Michael").not()); |
| 188 | assertEquals(3, kcn.docCount()); |
| 189 | |
| 190 | // Remove one document |
| 191 | ki.delDocs("author", "Peter"); |
| 192 | ki.commit(); |
| 193 | |
| 194 | kcn.fromBuilder(cb.term("author", "Frank")); |
| 195 | assertEquals(1, kcn.docCount()); |
| 196 | kcn.fromBuilder(cb.term("author", "Peter")); |
| 197 | assertEquals(0, kcn.docCount()); |
| 198 | kcn.fromBuilder(cb.term("author", "Sebastian")); |
| 199 | assertEquals(1, kcn.docCount()); |
| 200 | kcn.fromBuilder(cb.term("author", "Michael").not()); |
| 201 | assertEquals(2, kcn.docCount()); |
| Akron | 80cba8d | 2015-07-27 17:27:46 +0200 | [diff] [blame] | 202 | |
| 203 | // Readd Peter's doc |
| 204 | ki.addDoc(createDoc2()); |
| 205 | ki.commit(); |
| 206 | |
| 207 | kcn.fromBuilder(cb.term("author", "Frank")); |
| 208 | assertEquals(1, kcn.docCount()); |
| 209 | kcn.fromBuilder(cb.term("author", "Peter")); |
| 210 | assertEquals(1, kcn.docCount()); |
| 211 | kcn.fromBuilder(cb.term("author", "Sebastian")); |
| 212 | assertEquals(1, kcn.docCount()); |
| 213 | kcn.fromBuilder(cb.term("author", "Michael").not()); |
| 214 | assertEquals(3, kcn.docCount()); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 215 | }; |
| 216 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 217 | |
| Akron | 80cba8d | 2015-07-27 17:27:46 +0200 | [diff] [blame] | 218 | @Test |
| Akron | 1d63f27 | 2015-07-28 12:19:49 +0200 | [diff] [blame] | 219 | public void testIndexStream () throws IOException { |
| 220 | ki = new KrillIndex(); |
| 221 | FieldDocument fd = ki.addDoc(createDoc1()); |
| 222 | ki.commit(); |
| 223 | |
| 224 | Analyzer ana = new TextAnalyzer(); |
| 225 | TokenStream ts = fd.doc.getField("text").tokenStream(ana, null); |
| 226 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 227 | CharTermAttribute charTermAttribute = ts |
| 228 | .addAttribute(CharTermAttribute.class); |
| Akron | 1d63f27 | 2015-07-28 12:19:49 +0200 | [diff] [blame] | 229 | ts.reset(); |
| 230 | |
| 231 | ts.incrementToken(); |
| 232 | assertEquals("der", charTermAttribute.toString()); |
| 233 | ts.incrementToken(); |
| 234 | assertEquals("alte", charTermAttribute.toString()); |
| 235 | ts.incrementToken(); |
| 236 | assertEquals("mann", charTermAttribute.toString()); |
| 237 | ts.incrementToken(); |
| 238 | assertEquals("ging", charTermAttribute.toString()); |
| 239 | ts.incrementToken(); |
| 240 | assertEquals("über", charTermAttribute.toString()); |
| 241 | ts.incrementToken(); |
| 242 | assertEquals("die", charTermAttribute.toString()); |
| 243 | ts.incrementToken(); |
| 244 | assertEquals("straße", charTermAttribute.toString()); |
| 245 | }; |
| 246 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 247 | |
| Akron | 1d63f27 | 2015-07-28 12:19:49 +0200 | [diff] [blame] | 248 | @Test |
| Akron | 80cba8d | 2015-07-27 17:27:46 +0200 | [diff] [blame] | 249 | public void testIndexWithDateRanges () throws IOException { |
| 250 | ki = new KrillIndex(); |
| 251 | ki.addDoc(createDoc1()); |
| 252 | ki.addDoc(createDoc2()); |
| 253 | ki.addDoc(createDoc3()); |
| 254 | ki.commit(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 255 | CollectionBuilder cb = new CollectionBuilder(); |
| 256 | KrillCollection kcn = new KrillCollection(ki); |
| Akron | 80cba8d | 2015-07-27 17:27:46 +0200 | [diff] [blame] | 257 | |
| 258 | kcn.fromBuilder(cb.date("pubDate", "2005")); |
| 259 | assertEquals(3, kcn.docCount()); |
| 260 | kcn.fromBuilder(cb.date("pubDate", "2005-12")); |
| 261 | assertEquals(3, kcn.docCount()); |
| 262 | |
| 263 | kcn.fromBuilder(cb.date("pubDate", "2005-12-10")); |
| 264 | assertEquals(1, kcn.docCount()); |
| 265 | kcn.fromBuilder(cb.date("pubDate", "2005-12-16")); |
| 266 | assertEquals(1, kcn.docCount()); |
| 267 | kcn.fromBuilder(cb.date("pubDate", "2005-12-07")); |
| 268 | assertEquals(1, kcn.docCount()); |
| 269 | |
| 270 | kcn.fromBuilder(cb.since("pubDate", "2005-12-07")); |
| 271 | assertEquals(3, kcn.docCount()); |
| 272 | kcn.fromBuilder(cb.since("pubDate", "2005-12-10")); |
| 273 | assertEquals(2, kcn.docCount()); |
| 274 | kcn.fromBuilder(cb.since("pubDate", "2005-12-16")); |
| 275 | assertEquals(1, kcn.docCount()); |
| 276 | |
| 277 | kcn.fromBuilder(cb.till("pubDate", "2005-12-16")); |
| 278 | assertEquals(3, kcn.docCount()); |
| 279 | kcn.fromBuilder(cb.till("pubDate", "2005-12-10")); |
| 280 | assertEquals(2, kcn.docCount()); |
| 281 | kcn.fromBuilder(cb.till("pubDate", "2005-12-07")); |
| 282 | assertEquals(1, kcn.docCount()); |
| 283 | |
| 284 | kcn.fromBuilder(cb.date("pubDate", "2005-12-10").not()); |
| 285 | assertEquals(2, kcn.docCount()); |
| 286 | kcn.fromBuilder(cb.date("pubDate", "2005-12-16").not()); |
| 287 | assertEquals(2, kcn.docCount()); |
| 288 | kcn.fromBuilder(cb.date("pubDate", "2005-12-07").not()); |
| 289 | assertEquals(2, kcn.docCount()); |
| 290 | kcn.fromBuilder(cb.date("pubDate", "2005-12-09").not()); |
| 291 | assertEquals(3, kcn.docCount()); |
| 292 | |
| 293 | |
| 294 | kcn.fromBuilder(cb.till("pubDate", "2005-12-16").not()); |
| 295 | assertEquals(0, kcn.docCount()); |
| 296 | kcn.fromBuilder(cb.till("pubDate", "2005-12-15").not()); |
| 297 | assertEquals(1, kcn.docCount()); |
| 298 | kcn.fromBuilder(cb.till("pubDate", "2005-12-10").not()); |
| 299 | assertEquals(1, kcn.docCount()); |
| 300 | kcn.fromBuilder(cb.till("pubDate", "2005-12-09").not()); |
| 301 | assertEquals(2, kcn.docCount()); |
| 302 | kcn.fromBuilder(cb.till("pubDate", "2005-12-07").not()); |
| 303 | assertEquals(2, kcn.docCount()); |
| 304 | kcn.fromBuilder(cb.till("pubDate", "2005-12-06").not()); |
| 305 | assertEquals(3, kcn.docCount()); |
| 306 | }; |
| 307 | |
| 308 | |
| 309 | @Test |
| 310 | public void testIndexWithRegexes () throws IOException { |
| 311 | ki = new KrillIndex(); |
| 312 | |
| 313 | ki.addDoc(createDoc1()); |
| 314 | ki.addDoc(createDoc2()); |
| 315 | ki.addDoc(createDoc3()); |
| 316 | ki.commit(); |
| 317 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 318 | CollectionBuilder cb = new CollectionBuilder(); |
| 319 | KrillCollection kcn = new KrillCollection(ki); |
| Akron | 80cba8d | 2015-07-27 17:27:46 +0200 | [diff] [blame] | 320 | |
| 321 | kcn.fromBuilder(cb.re("author", "Fran.*")); |
| 322 | assertEquals(1, kcn.docCount()); |
| 323 | kcn.fromBuilder(cb.re("author", "Blin.*")); |
| 324 | assertEquals(0, kcn.docCount()); |
| 325 | kcn.fromBuilder(cb.re("author", "Frank|Peter")); |
| 326 | assertEquals(2, kcn.docCount()); |
| 327 | |
| Akron | 1d63f27 | 2015-07-28 12:19:49 +0200 | [diff] [blame] | 328 | // "Frau" doesn't work! |
| 329 | kcn.fromBuilder(cb.term("text", "frau")); |
| Akron | 80cba8d | 2015-07-27 17:27:46 +0200 | [diff] [blame] | 330 | assertEquals(1, kcn.docCount()); |
| 331 | |
| Akron | 1d63f27 | 2015-07-28 12:19:49 +0200 | [diff] [blame] | 332 | kcn.fromBuilder(cb.re("text", "frau")); |
| Akron | 80cba8d | 2015-07-27 17:27:46 +0200 | [diff] [blame] | 333 | assertEquals(1, kcn.docCount()); |
| 334 | |
| Akron | 1d63f27 | 2015-07-28 12:19:49 +0200 | [diff] [blame] | 335 | kcn.fromBuilder(cb.re("text", "frau|mann")); |
| Akron | 80cba8d | 2015-07-27 17:27:46 +0200 | [diff] [blame] | 336 | assertEquals(3, kcn.docCount()); |
| 337 | }; |
| 338 | |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 339 | |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 340 | @Test |
| 341 | public void filterExampleFromLegacy () throws Exception { |
| 342 | |
| 343 | // Construct index |
| 344 | KrillIndex ki = new KrillIndex(); |
| 345 | // Indexing test files |
| 346 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 347 | "00005", "00006", "02439" }) { |
| 348 | ki.addDoc( |
| 349 | getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| 350 | true); |
| 351 | }; |
| 352 | ki.commit(); |
| 353 | |
| 354 | // Create Virtual collections: |
| 355 | KrillCollection kc = new KrillCollection(ki); |
| 356 | |
| 357 | assertEquals("Documents", 7, kc.numberOf("documents")); |
| 358 | |
| 359 | // The virtual collection consists of all documents that have |
| 360 | // the textClass "reisen" and "freizeit" |
| 361 | |
| 362 | /* kc.filter(kf.and("textClass", "reisen").and("textClass", |
| 363 | "freizeit-unterhaltung")); |
| 364 | */ |
| 365 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 366 | kc.fromBuilder(kc.build().andGroup() |
| 367 | .with(kc.build().term("textClass", "reisen")) |
| 368 | .with(kc.build().term("textClass", "freizeit-unterhaltung"))); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 369 | |
| 370 | assertEquals("Documents", 5, kc.numberOf("documents")); |
| 371 | assertEquals("Tokens", 1678, kc.numberOf("tokens")); |
| 372 | assertEquals("Sentences", 194, kc.numberOf("sentences")); |
| 373 | assertEquals("Paragraphs", 139, kc.numberOf("paragraphs")); |
| 374 | |
| 375 | |
| 376 | // Subset this to all documents that have also the text |
| 377 | // kc.filter(kf.and("textClass", "kultur")); |
| 378 | /* |
| 379 | kc.fromBuilder( |
| 380 | kc.build().andGroup().with( |
| 381 | kc.getBuilder() |
| 382 | ).with( |
| 383 | kc.build().term("textClass", "kultur") |
| 384 | ) |
| 385 | ); |
| 386 | */ |
| 387 | |
| 388 | kc.filter(kc.build().term("textClass", "kultur")); |
| 389 | |
| 390 | assertEquals("Documents", 1, kc.numberOf("documents")); |
| 391 | assertEquals("Tokens", 405, kc.numberOf("tokens")); |
| 392 | assertEquals("Sentences", 75, kc.numberOf("sentences")); |
| 393 | assertEquals("Paragraphs", 48, kc.numberOf("paragraphs")); |
| 394 | |
| 395 | |
| 396 | // kc.filter(kf.and("corpusID", "WPD")); |
| 397 | kc.filter(kc.build().term("corpusID", "WPD")); |
| 398 | |
| 399 | assertEquals("Documents", 1, kc.numberOf("documents")); |
| 400 | assertEquals("Tokens", 405, kc.numberOf("tokens")); |
| 401 | assertEquals("Sentences", 75, kc.numberOf("sentences")); |
| 402 | assertEquals("Paragraphs", 48, kc.numberOf("paragraphs")); |
| 403 | |
| 404 | // Create a query |
| 405 | QueryBuilder kq = new QueryBuilder("tokens"); |
| 406 | SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery(); |
| 407 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 408 | Result kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true, |
| 409 | (short) 5); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 410 | assertEquals(kr.getTotalResults(), 70); |
| 411 | |
| 412 | |
| 413 | kc.extend(kc.build().term("textClass", "uninteresting")); |
| 414 | assertEquals("Documents", 1, kc.numberOf("documents")); |
| 415 | |
| 416 | kc.extend(kc.build().term("textClass", "wissenschaft")); |
| 417 | |
| 418 | assertEquals("Documents", 3, kc.numberOf("documents")); |
| 419 | assertEquals("Tokens", 1669, kc.numberOf("tokens")); |
| 420 | assertEquals("Sentences", 188, kc.numberOf("sentences")); |
| 421 | assertEquals("Paragraphs", 130, kc.numberOf("paragraphs")); |
| 422 | // System.err.println(kr.toJSON()); |
| 423 | }; |
| 424 | |
| 425 | |
| 426 | @Test |
| 427 | public void filterExampleAtomicLegacy () throws Exception { |
| 428 | |
| 429 | // That's exactly the same test class, but with multiple atomic indices |
| 430 | |
| 431 | // Construct index |
| 432 | KrillIndex ki = new KrillIndex(); |
| 433 | // Indexing test files |
| 434 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 435 | "00005", "00006", "02439" }) { |
| 436 | ki.addDoc( |
| 437 | getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| 438 | true); |
| 439 | ki.commit(); |
| 440 | }; |
| 441 | |
| 442 | CollectionBuilder kf = new CollectionBuilder(); |
| 443 | |
| 444 | // Create Virtual collections: |
| 445 | KrillCollection kc = new KrillCollection(ki); |
| 446 | |
| 447 | assertEquals("Documents", 7, kc.numberOf("documents")); |
| 448 | |
| 449 | // If this is set - everything is fine automatically ... |
| 450 | kc.filter(kc.build().term("corpusID", "WPD")); |
| 451 | |
| 452 | assertEquals("Documents", 7, kc.numberOf("documents")); |
| 453 | |
| 454 | // The virtual collection consists of all documents that have the textClass "reisen" and "freizeit" |
| 455 | |
| 456 | /* |
| 457 | kc.filter(kf.and("textClass", "reisen").and("textClass", |
| 458 | "freizeit-unterhaltung")); |
| 459 | */ |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 460 | kc.filter(kc.build().andGroup() |
| 461 | .with(kc.build().term("textClass", "reisen")) |
| 462 | .with(kc.build().term("textClass", "freizeit-unterhaltung"))); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 463 | |
| 464 | assertEquals("Documents", 5, kc.numberOf("documents")); |
| 465 | assertEquals("Tokens", 1678, kc.numberOf("tokens")); |
| 466 | assertEquals("Sentences", 194, kc.numberOf("sentences")); |
| 467 | assertEquals("Paragraphs", 139, kc.numberOf("paragraphs")); |
| 468 | |
| 469 | // Subset this to all documents that have also the text |
| 470 | // kc.filter(kf.and("textClass", "kultur")); |
| 471 | |
| 472 | kc.filter(kc.build().term("textClass", "kultur")); |
| 473 | |
| 474 | assertEquals("Documents", 1, kc.numberOf("documents")); |
| 475 | assertEquals("Tokens", 405, kc.numberOf("tokens")); |
| 476 | assertEquals("Sentences", 75, kc.numberOf("sentences")); |
| 477 | assertEquals("Paragraphs", 48, kc.numberOf("paragraphs")); |
| 478 | |
| 479 | // This is already filtered though ... |
| 480 | // kc.filter(kf.and("corpusID", "WPD")); |
| 481 | kc.filter(kc.build().term("corpusID", "WPD")); |
| 482 | |
| 483 | assertEquals("Documents", 1, kc.numberOf("documents")); |
| 484 | assertEquals("Tokens", 405, kc.numberOf("tokens")); |
| 485 | assertEquals("Sentences", 75, kc.numberOf("sentences")); |
| 486 | assertEquals("Paragraphs", 48, kc.numberOf("paragraphs")); |
| 487 | |
| 488 | // Create a query |
| 489 | QueryBuilder kq = new QueryBuilder("tokens"); |
| 490 | SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery(); |
| 491 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 492 | Result kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true, |
| 493 | (short) 5); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 494 | assertEquals(kr.getTotalResults(), 70); |
| 495 | |
| 496 | // kc.extend(kf.and("textClass", "uninteresting")); |
| 497 | kc.extend(kc.build().term("textClass", "uninteresting")); |
| 498 | |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 499 | assertEquals("Documents", 1, kc.numberOf("documents")); |
| 500 | |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 501 | kc.extend(kc.build().term("textClass", "wissenschaft")); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 502 | |
| 503 | assertEquals("Documents", 3, kc.numberOf("documents")); |
| 504 | assertEquals("Tokens", 1669, kc.numberOf("tokens")); |
| 505 | assertEquals("Sentences", 188, kc.numberOf("sentences")); |
| 506 | assertEquals("Paragraphs", 130, kc.numberOf("paragraphs")); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 507 | |
| 508 | // System.err.println(kc.toString()); |
| 509 | // Test collectionbuilder simplifier! |
| 510 | /* |
| 511 | OrGroup( |
| 512 | AndGroup( |
| 513 | corpusID:WPD |
| 514 | textClass:reisen |
| 515 | textClass:freizeit-unterhaltung |
| 516 | textClass:kultur |
| 517 | corpusID:WPD |
| 518 | ) |
| 519 | textClass:uninteresting |
| 520 | textClass:wissenschaft |
| 521 | ) |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 522 | */ |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 523 | |
| 524 | assertTrue(ki.delDocs("textClass", "wissenschaft")); |
| 525 | ki.commit(); |
| 526 | |
| 527 | assertEquals("Documents", 1, kc.numberOf("documents")); |
| 528 | assertEquals("Tokens", 405, kc.numberOf("tokens")); |
| 529 | assertEquals("Sentences", 75, kc.numberOf("sentences")); |
| 530 | assertEquals("Paragraphs", 48, kc.numberOf("paragraphs")); |
| 531 | }; |
| 532 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 533 | |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 534 | @Test |
| 535 | public void filterExample2Legacy () throws Exception { |
| 536 | |
| 537 | // Construct index |
| 538 | KrillIndex ki = new KrillIndex(); |
| 539 | // Indexing test files |
| 540 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 541 | "00005", "00006", "02439" }) { |
| 542 | ki.addDoc( |
| 543 | getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| 544 | true); |
| 545 | }; |
| 546 | ki.commit(); |
| 547 | |
| 548 | ki.addDoc(getClass() |
| 549 | .getResourceAsStream("/wiki/00012-fakemeta.json.gz"), true); |
| 550 | |
| 551 | ki.commit(); |
| 552 | |
| 553 | /* |
| 554 | CollectionBuilderLegacy kf = new CollectionBuilderLegacy(); |
| 555 | |
| 556 | // Create Virtual collections: |
| 557 | KrillCollectionLegacy kc = new KrillCollectionLegacy(ki); |
| 558 | kc.filter(kf.and("textClass", "reisen").and("textClass", |
| 559 | "freizeit-unterhaltung")); |
| 560 | */ |
| 561 | |
| 562 | KrillCollection kc = new KrillCollection(ki); |
| 563 | CollectionBuilder cb = kc.build(); |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 564 | kc.filter(cb.andGroup().with(cb.term("textClass", "reisen")) |
| 565 | .with(cb.term("textClass", "freizeit-unterhaltung"))); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 566 | |
| 567 | assertEquals("Documents", 5, kc.numberOf("documents")); |
| 568 | assertEquals("Tokens", 1678, kc.numberOf("tokens")); |
| 569 | assertEquals("Sentences", 194, kc.numberOf("sentences")); |
| 570 | assertEquals("Paragraphs", 139, kc.numberOf("paragraphs")); |
| 571 | |
| 572 | |
| 573 | // Create a query |
| 574 | QueryBuilder kq = new QueryBuilder("tokens"); |
| 575 | SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery(); |
| 576 | |
| 577 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 578 | Result kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true, |
| 579 | (short) 5); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 580 | assertEquals(kr.getTotalResults(), 369); |
| 581 | |
| 582 | // kc.filter(kf.and("corpusID", "QQQ")); |
| 583 | kc.filter(cb.term("corpusID", "QQQ")); |
| 584 | |
| 585 | assertEquals("Documents", 0, kc.numberOf("documents")); |
| 586 | assertEquals("Tokens", 0, kc.numberOf("tokens")); |
| 587 | assertEquals("Sentences", 0, kc.numberOf("sentences")); |
| 588 | assertEquals("Paragraphs", 0, kc.numberOf("paragraphs")); |
| 589 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 590 | kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true, |
| 591 | (short) 5); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 592 | assertEquals(kr.getTotalResults(), 0); |
| 593 | }; |
| 594 | |
| 595 | |
| 596 | @Test |
| 597 | public void uidCollectionLegacy () throws IOException { |
| 598 | |
| 599 | // Construct index |
| 600 | KrillIndex ki = new KrillIndex(); |
| 601 | // Indexing test files |
| 602 | int uid = 1; |
| 603 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 604 | "00005", "00006", "02439" }) { |
| 605 | FieldDocument fd = ki.addDoc(uid++, |
| 606 | getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| 607 | true); |
| 608 | }; |
| 609 | ki.commit(); |
| 610 | |
| 611 | assertEquals("Documents", 7, ki.numberOf("documents")); |
| 612 | assertEquals("Paragraphs", 174, ki.numberOf("paragraphs")); |
| 613 | assertEquals("Sentences", 281, ki.numberOf("sentences")); |
| 614 | assertEquals("Tokens", 2661, ki.numberOf("tokens")); |
| 615 | |
| 616 | SpanQuery sq = new SpanTermQuery(new Term("tokens", "s:der")); |
| 617 | Result kr = ki.search(sq, (short) 10); |
| 618 | assertEquals(86, kr.getTotalResults()); |
| 619 | |
| 620 | // Create Virtual collections: |
| 621 | KrillCollection kc = new KrillCollection(); |
| 622 | kc.filterUIDs(new String[] { "2", "3", "4" }); |
| 623 | kc.setIndex(ki); |
| 624 | assertEquals("Documents", 3, kc.numberOf("documents")); |
| 625 | |
| 626 | assertEquals("Paragraphs", 46, kc.numberOf("paragraphs")); |
| 627 | assertEquals("Sentences", 103, kc.numberOf("sentences")); |
| 628 | assertEquals("Tokens", 1229, kc.numberOf("tokens")); |
| 629 | |
| 630 | kr = ki.search(kc, sq, 0, (short) 20, true, (short) 5, true, (short) 5); |
| 631 | |
| 632 | assertEquals((long) 39, kr.getTotalResults()); |
| 633 | }; |
| 634 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 635 | |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 636 | @Test |
| 637 | public void uidCollectionWithDeletions () throws IOException { |
| 638 | |
| 639 | // Construct index |
| 640 | KrillIndex ki = new KrillIndex(); |
| 641 | // Indexing test files |
| 642 | int uid = 1; |
| 643 | for (String i : new String[] { "00001", "00002", "00003", "00004", |
| 644 | "00005", "00006", "02439" }) { |
| 645 | FieldDocument fd = ki.addDoc(uid++, |
| 646 | getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| 647 | true); |
| 648 | }; |
| 649 | ki.commit(); |
| 650 | |
| 651 | |
| 652 | assertEquals("Documents", 7, ki.numberOf("documents")); |
| 653 | assertEquals("Paragraphs", 174, ki.numberOf("paragraphs")); |
| 654 | assertEquals("Sentences", 281, ki.numberOf("sentences")); |
| 655 | assertEquals("Tokens", 2661, ki.numberOf("tokens")); |
| 656 | |
| 657 | assertTrue(ki.delDoc(3)); |
| 658 | ki.commit(); |
| 659 | |
| 660 | assertEquals("Documents", 6, ki.numberOf("documents")); |
| 661 | |
| 662 | assertEquals("Paragraphs", 146, ki.numberOf("paragraphs")); |
| 663 | assertEquals("Sentences", 212, ki.numberOf("sentences")); |
| 664 | assertEquals("Tokens", 2019, ki.numberOf("tokens")); |
| 665 | |
| 666 | assertTrue(ki.delDoc(2)); |
| 667 | assertTrue(ki.delDoc(3)); |
| 668 | assertTrue(ki.delDoc(4)); |
| 669 | assertTrue(ki.delDoc(5)); |
| 670 | assertTrue(ki.delDoc(6)); |
| 671 | assertTrue(ki.delDoc(7)); |
| 672 | ki.commit(); |
| 673 | |
| 674 | assertEquals("Documents", 1, ki.numberOf("documents")); |
| 675 | assertEquals("Paragraphs", 75, ki.numberOf("paragraphs")); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 676 | }; |
| 677 | |
| 678 | |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 679 | private FieldDocument createDoc1 () { |
| 680 | FieldDocument fd = new FieldDocument(); |
| 681 | fd.addString("ID", "doc-1"); |
| 682 | fd.addString("author", "Frank"); |
| 683 | fd.addKeyword("textClass", "Nachricht Kultur Reisen"); |
| 684 | fd.addInt("pubDate", 20051210); |
| 685 | fd.addText("text", "Der alte Mann ging über die Straße"); |
| 686 | return fd; |
| 687 | }; |
| 688 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 689 | |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 690 | private FieldDocument createDoc2 () { |
| 691 | FieldDocument fd = new FieldDocument(); |
| 692 | fd.addString("ID", "doc-2"); |
| 693 | fd.addString("author", "Peter"); |
| 694 | fd.addKeyword("textClass", "Kultur Reisen"); |
| 695 | fd.addInt("pubDate", 20051207); |
| 696 | fd.addText("text", "Der junge Mann hatte keine andere Wahl"); |
| 697 | return fd; |
| 698 | }; |
| 699 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 700 | |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 701 | private FieldDocument createDoc3 () { |
| 702 | FieldDocument fd = new FieldDocument(); |
| 703 | fd.addString("ID", "doc-3"); |
| 704 | fd.addString("author", "Sebastian"); |
| 705 | fd.addKeyword("textClass", "Reisen Finanzen"); |
| 706 | fd.addInt("pubDate", 20051216); |
| 707 | fd.addText("text", "Die Frau und der Mann küssten sich"); |
| 708 | return fd; |
| 709 | }; |
| 710 | }; |