| package de.ids_mannheim.korap.collection; |
| |
| import java.io.IOException; |
| |
| import de.ids_mannheim.korap.KrillIndex; |
| import de.ids_mannheim.korap.KrillCollection; |
| import de.ids_mannheim.korap.collection.CollectionBuilder; |
| import de.ids_mannheim.korap.index.FieldDocument; |
| import de.ids_mannheim.korap.index.TextAnalyzer; |
| import de.ids_mannheim.korap.response.Result; |
| import de.ids_mannheim.korap.KrillQuery; |
| import de.ids_mannheim.korap.query.QueryBuilder; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.spans.SpanOrQuery; |
| import org.apache.lucene.search.spans.SpanQuery; |
| import org.apache.lucene.search.spans.SpanTermQuery; |
| import org.apache.lucene.search.spans.SpanQuery; |
| |
| import static org.junit.Assert.*; |
| import org.junit.Test; |
| import org.junit.Ignore; |
| import org.junit.runner.RunWith; |
| import org.junit.runners.JUnit4; |
| |
/**
 * JUnit 4 tests that build virtual collections with CollectionBuilder and
 * KrillCollection and check their document, token, sentence and paragraph
 * counts against freshly constructed KrillIndex instances.
 */
| @RunWith(JUnit4.class) |
| public class TestKrillCollectionIndex { |
// Index used by the testIndex* methods, each of which assigns a new
// KrillIndex to it; the remaining tests declare a local "ki" that shadows it.
| private KrillIndex ki; |
| |
| |
| @Test |
| public void testIndexWithCollectionBuilder () throws IOException { |
| ki = new KrillIndex(); |
| ki.addDoc(createDoc1()); |
| ki.addDoc(createDoc2()); |
| ki.addDoc(createDoc3()); |
| ki.commit(); |
| CollectionBuilder cb = new CollectionBuilder(); |
| KrillCollection kcn = new KrillCollection(ki); |
| |
| // Simple string tests |
| kcn.fromBuilder(cb.term("author", "Frank")); |
| assertEquals(1, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.term("author", "Peter")); |
| assertEquals(1, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.term("author", "Sebastian")); |
| assertEquals(1, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.term("author", "Michael")); |
| assertEquals(0, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.term("textClass", "reisen")); |
| assertEquals(3, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.term("textClass", "kultur")); |
| assertEquals(2, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.term("textClass", "finanzen")); |
| assertEquals(1, kcn.docCount()); |
| |
| // Simple orGroup tests |
| kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Frank")) |
| .with(cb.term("author", "Michael"))); |
| assertEquals(1, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Frank")) |
| .with(cb.term("author", "Sebastian"))); |
| assertEquals(2, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Frank")) |
| .with(cb.term("author", "Sebastian")) |
| .with(cb.term("author", "Peter"))); |
| assertEquals(3, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Huhu")) |
| .with(cb.term("author", "Haha")) |
| .with(cb.term("author", "Hehe"))); |
| assertEquals(0, kcn.docCount()); |
| |
| // Multi field orGroup tests |
| kcn.fromBuilder(cb.orGroup().with(cb.term("ID", "doc-1")) |
| .with(cb.term("author", "Peter"))); |
| assertEquals(2, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.orGroup().with(cb.term("ID", "doc-1")) |
| .with(cb.term("author", "Frank"))); |
| assertEquals(1, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.orGroup().with(cb.term("ID", "doc-1")) |
| .with(cb.term("author", "Michael"))); |
| assertEquals(1, kcn.docCount()); |
| |
| // Simple andGroup tests |
| kcn.fromBuilder(cb.andGroup().with(cb.term("author", "Frank")) |
| .with(cb.term("author", "Michael"))); |
| assertEquals(0, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.andGroup().with(cb.term("ID", "doc-1")) |
| .with(cb.term("author", "Frank"))); |
| assertEquals(1, kcn.docCount()); |
| |
| // andGroup in keyword field test |
| kcn.fromBuilder(cb.andGroup().with(cb.term("textClass", "reisen")) |
| .with(cb.term("textClass", "finanzen"))); |
| assertEquals(1, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.andGroup().with(cb.term("textClass", "reisen")) |
| .with(cb.term("textClass", "kultur"))); |
| assertEquals(2, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.andGroup().with(cb.term("textClass", "finanzen")) |
| .with(cb.term("textClass", "kultur"))); |
| assertEquals(0, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.term("text", "mann")); |
| assertEquals(3, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.term("text", "frau")); |
| assertEquals(1, kcn.docCount()); |
| }; |
| |
| |
| @Test |
| public void testIndexWithNegation () throws IOException { |
| ki = new KrillIndex(); |
| ki.addDoc(createDoc1()); |
| ki.addDoc(createDoc2()); |
| ki.addDoc(createDoc3()); |
| ki.commit(); |
| CollectionBuilder cb = new CollectionBuilder(); |
| KrillCollection kcn = new KrillCollection(ki); |
| |
| // Simple negation tests |
| kcn.fromBuilder(cb.term("author", "Frank").not()); |
| assertEquals(2, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.term("textClass", "reisen").not()); |
| assertEquals(0, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.term("textClass", "kultur").not()); |
| assertEquals(1, kcn.docCount()); |
| |
| // orGroup with simple Negation |
| kcn.fromBuilder(cb.orGroup().with(cb.term("textClass", "kultur").not()) |
| .with(cb.term("author", "Peter"))); |
| assertEquals(2, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.orGroup().with(cb.term("textClass", "kultur").not()) |
| .with(cb.term("author", "Sebastian"))); |
| assertEquals(1, kcn.docCount()); |
| }; |
| |
| |
| @Test |
| public void testIndexWithMultipleCommitsAndDeletes () throws IOException { |
| ki = new KrillIndex(); |
| ki.addDoc(createDoc1()); |
| ki.addDoc(createDoc2()); |
| ki.commit(); |
| CollectionBuilder cb = new CollectionBuilder(); |
| KrillCollection kcn = new KrillCollection(ki); |
| |
| kcn.fromBuilder(cb.term("author", "Frank")); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.term("author", "Peter")); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.term("author", "Sebastian")); |
| assertEquals(0, kcn.docCount()); |
| kcn.fromBuilder(cb.term("author", "Michael").not()); |
| assertEquals(2, kcn.docCount()); |
| |
| // Add Sebastians doc |
| ki.addDoc(createDoc3()); |
| ki.commit(); |
| |
| kcn.fromBuilder(cb.term("author", "Frank")); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.term("author", "Peter")); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.term("author", "Sebastian")); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.term("author", "Michael").not()); |
| assertEquals(3, kcn.docCount()); |
| |
| // Remove one document |
| ki.delDocs("author", "Peter"); |
| ki.commit(); |
| |
| kcn.fromBuilder(cb.term("author", "Frank")); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.term("author", "Peter")); |
| assertEquals(0, kcn.docCount()); |
| kcn.fromBuilder(cb.term("author", "Sebastian")); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.term("author", "Michael").not()); |
| assertEquals(2, kcn.docCount()); |
| |
| // Readd Peter's doc |
| ki.addDoc(createDoc2()); |
| ki.commit(); |
| |
| kcn.fromBuilder(cb.term("author", "Frank")); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.term("author", "Peter")); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.term("author", "Sebastian")); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.term("author", "Michael").not()); |
| assertEquals(3, kcn.docCount()); |
| }; |
| |
| |
| @Test |
| public void testIndexStream () throws IOException { |
| ki = new KrillIndex(); |
| FieldDocument fd = ki.addDoc(createDoc1()); |
| ki.commit(); |
| |
| Analyzer ana = new TextAnalyzer(); |
| TokenStream ts = fd.doc.getField("text").tokenStream(ana, null); |
| |
| CharTermAttribute charTermAttribute = ts |
| .addAttribute(CharTermAttribute.class); |
| ts.reset(); |
| |
| ts.incrementToken(); |
| assertEquals("der", charTermAttribute.toString()); |
| ts.incrementToken(); |
| assertEquals("alte", charTermAttribute.toString()); |
| ts.incrementToken(); |
| assertEquals("mann", charTermAttribute.toString()); |
| ts.incrementToken(); |
| assertEquals("ging", charTermAttribute.toString()); |
| ts.incrementToken(); |
| assertEquals("über", charTermAttribute.toString()); |
| ts.incrementToken(); |
| assertEquals("die", charTermAttribute.toString()); |
| ts.incrementToken(); |
| assertEquals("straße", charTermAttribute.toString()); |
| }; |
| |
| |
| @Test |
| public void testIndexWithDateRanges () throws IOException { |
| ki = new KrillIndex(); |
| ki.addDoc(createDoc1()); |
| ki.addDoc(createDoc2()); |
| ki.addDoc(createDoc3()); |
| ki.commit(); |
| CollectionBuilder cb = new CollectionBuilder(); |
| KrillCollection kcn = new KrillCollection(ki); |
| |
| kcn.fromBuilder(cb.date("pubDate", "2005")); |
| assertEquals(3, kcn.docCount()); |
| kcn.fromBuilder(cb.date("pubDate", "2005-12")); |
| assertEquals(3, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.date("pubDate", "2005-12-10")); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.date("pubDate", "2005-12-16")); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.date("pubDate", "2005-12-07")); |
| assertEquals(1, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.since("pubDate", "2005-12-07")); |
| assertEquals(3, kcn.docCount()); |
| kcn.fromBuilder(cb.since("pubDate", "2005-12-10")); |
| assertEquals(2, kcn.docCount()); |
| kcn.fromBuilder(cb.since("pubDate", "2005-12-16")); |
| assertEquals(1, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.till("pubDate", "2005-12-16")); |
| assertEquals(3, kcn.docCount()); |
| kcn.fromBuilder(cb.till("pubDate", "2005-12-10")); |
| assertEquals(2, kcn.docCount()); |
| kcn.fromBuilder(cb.till("pubDate", "2005-12-07")); |
| assertEquals(1, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.date("pubDate", "2005-12-10").not()); |
| assertEquals(2, kcn.docCount()); |
| kcn.fromBuilder(cb.date("pubDate", "2005-12-16").not()); |
| assertEquals(2, kcn.docCount()); |
| kcn.fromBuilder(cb.date("pubDate", "2005-12-07").not()); |
| assertEquals(2, kcn.docCount()); |
| kcn.fromBuilder(cb.date("pubDate", "2005-12-09").not()); |
| assertEquals(3, kcn.docCount()); |
| |
| |
| kcn.fromBuilder(cb.till("pubDate", "2005-12-16").not()); |
| assertEquals(0, kcn.docCount()); |
| kcn.fromBuilder(cb.till("pubDate", "2005-12-15").not()); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.till("pubDate", "2005-12-10").not()); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.till("pubDate", "2005-12-09").not()); |
| assertEquals(2, kcn.docCount()); |
| kcn.fromBuilder(cb.till("pubDate", "2005-12-07").not()); |
| assertEquals(2, kcn.docCount()); |
| kcn.fromBuilder(cb.till("pubDate", "2005-12-06").not()); |
| assertEquals(3, kcn.docCount()); |
| }; |
| |
| |
| @Test |
| public void testIndexWithRegexes () throws IOException { |
| ki = new KrillIndex(); |
| |
| ki.addDoc(createDoc1()); |
| ki.addDoc(createDoc2()); |
| ki.addDoc(createDoc3()); |
| ki.commit(); |
| |
| CollectionBuilder cb = new CollectionBuilder(); |
| KrillCollection kcn = new KrillCollection(ki); |
| |
| kcn.fromBuilder(cb.re("author", "Fran.*")); |
| assertEquals(1, kcn.docCount()); |
| kcn.fromBuilder(cb.re("author", "Blin.*")); |
| assertEquals(0, kcn.docCount()); |
| kcn.fromBuilder(cb.re("author", "Frank|Peter")); |
| assertEquals(2, kcn.docCount()); |
| |
| // "Frau" doesn't work! |
| kcn.fromBuilder(cb.term("text", "frau")); |
| assertEquals(1, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.re("text", "frau")); |
| assertEquals(1, kcn.docCount()); |
| |
| kcn.fromBuilder(cb.re("text", "frau|mann")); |
| assertEquals(3, kcn.docCount()); |
| }; |
| |
| |
| @Test |
| public void filterExampleFromLegacy () throws Exception { |
| |
| // Construct index |
| KrillIndex ki = new KrillIndex(); |
| // Indexing test files |
| for (String i : new String[] { "00001", "00002", "00003", "00004", |
| "00005", "00006", "02439" }) { |
| ki.addDoc( |
| getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| true); |
| }; |
| ki.commit(); |
| |
| // Create Virtual collections: |
| KrillCollection kc = new KrillCollection(ki); |
| |
| assertEquals("Documents", 7, kc.numberOf("documents")); |
| |
| // The virtual collection consists of all documents that have |
| // the textClass "reisen" and "freizeit" |
| |
| /* kc.filter(kf.and("textClass", "reisen").and("textClass", |
| "freizeit-unterhaltung")); |
| */ |
| |
| kc.fromBuilder(kc.build().andGroup() |
| .with(kc.build().term("textClass", "reisen")) |
| .with(kc.build().term("textClass", "freizeit-unterhaltung"))); |
| |
| assertEquals("Documents", 5, kc.numberOf("documents")); |
| assertEquals("Tokens", 1678, kc.numberOf("tokens")); |
| assertEquals("Sentences", 194, kc.numberOf("sentences")); |
| assertEquals("Paragraphs", 139, kc.numberOf("paragraphs")); |
| |
| |
| // Subset this to all documents that have also the text |
| // kc.filter(kf.and("textClass", "kultur")); |
| /* |
| kc.fromBuilder( |
| kc.build().andGroup().with( |
| kc.getBuilder() |
| ).with( |
| kc.build().term("textClass", "kultur") |
| ) |
| ); |
| */ |
| |
| kc.filter(kc.build().term("textClass", "kultur")); |
| |
| assertEquals("Documents", 1, kc.numberOf("documents")); |
| assertEquals("Tokens", 405, kc.numberOf("tokens")); |
| assertEquals("Sentences", 75, kc.numberOf("sentences")); |
| assertEquals("Paragraphs", 48, kc.numberOf("paragraphs")); |
| |
| |
| // kc.filter(kf.and("corpusID", "WPD")); |
| kc.filter(kc.build().term("corpusID", "WPD")); |
| |
| assertEquals("Documents", 1, kc.numberOf("documents")); |
| assertEquals("Tokens", 405, kc.numberOf("tokens")); |
| assertEquals("Sentences", 75, kc.numberOf("sentences")); |
| assertEquals("Paragraphs", 48, kc.numberOf("paragraphs")); |
| |
| // Create a query |
| QueryBuilder kq = new QueryBuilder("tokens"); |
| SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery(); |
| |
| Result kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true, |
| (short) 5); |
| assertEquals(kr.getTotalResults(), 70); |
| |
| |
| kc.extend(kc.build().term("textClass", "uninteresting")); |
| assertEquals("Documents", 1, kc.numberOf("documents")); |
| |
| kc.extend(kc.build().term("textClass", "wissenschaft")); |
| |
| assertEquals("Documents", 3, kc.numberOf("documents")); |
| assertEquals("Tokens", 1669, kc.numberOf("tokens")); |
| assertEquals("Sentences", 188, kc.numberOf("sentences")); |
| assertEquals("Paragraphs", 130, kc.numberOf("paragraphs")); |
| // System.err.println(kr.toJSON()); |
| }; |
| |
| |
| @Test |
| public void filterExampleAtomicLegacy () throws Exception { |
| |
| // That's exactly the same test class, but with multiple atomic indices |
| |
| // Construct index |
| KrillIndex ki = new KrillIndex(); |
| // Indexing test files |
| for (String i : new String[] { "00001", "00002", "00003", "00004", |
| "00005", "00006", "02439" }) { |
| ki.addDoc( |
| getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| true); |
| ki.commit(); |
| }; |
| |
| CollectionBuilder kf = new CollectionBuilder(); |
| |
| // Create Virtual collections: |
| KrillCollection kc = new KrillCollection(ki); |
| |
| assertEquals("Documents", 7, kc.numberOf("documents")); |
| |
| // If this is set - everything is fine automatically ... |
| kc.filter(kc.build().term("corpusID", "WPD")); |
| |
| assertEquals("Documents", 7, kc.numberOf("documents")); |
| |
| // The virtual collection consists of all documents that have the textClass "reisen" and "freizeit" |
| |
| /* |
| kc.filter(kf.and("textClass", "reisen").and("textClass", |
| "freizeit-unterhaltung")); |
| */ |
| kc.filter(kc.build().andGroup() |
| .with(kc.build().term("textClass", "reisen")) |
| .with(kc.build().term("textClass", "freizeit-unterhaltung"))); |
| |
| assertEquals("Documents", 5, kc.numberOf("documents")); |
| assertEquals("Tokens", 1678, kc.numberOf("tokens")); |
| assertEquals("Sentences", 194, kc.numberOf("sentences")); |
| assertEquals("Paragraphs", 139, kc.numberOf("paragraphs")); |
| |
| // Subset this to all documents that have also the text |
| // kc.filter(kf.and("textClass", "kultur")); |
| |
| kc.filter(kc.build().term("textClass", "kultur")); |
| |
| assertEquals("Documents", 1, kc.numberOf("documents")); |
| assertEquals("Tokens", 405, kc.numberOf("tokens")); |
| assertEquals("Sentences", 75, kc.numberOf("sentences")); |
| assertEquals("Paragraphs", 48, kc.numberOf("paragraphs")); |
| |
| // This is already filtered though ... |
| // kc.filter(kf.and("corpusID", "WPD")); |
| kc.filter(kc.build().term("corpusID", "WPD")); |
| |
| assertEquals("Documents", 1, kc.numberOf("documents")); |
| assertEquals("Tokens", 405, kc.numberOf("tokens")); |
| assertEquals("Sentences", 75, kc.numberOf("sentences")); |
| assertEquals("Paragraphs", 48, kc.numberOf("paragraphs")); |
| |
| // Create a query |
| QueryBuilder kq = new QueryBuilder("tokens"); |
| SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery(); |
| |
| Result kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true, |
| (short) 5); |
| assertEquals(kr.getTotalResults(), 70); |
| |
| // kc.extend(kf.and("textClass", "uninteresting")); |
| kc.extend(kc.build().term("textClass", "uninteresting")); |
| |
| assertEquals("Documents", 1, kc.numberOf("documents")); |
| |
| kc.extend(kc.build().term("textClass", "wissenschaft")); |
| |
| assertEquals("Documents", 3, kc.numberOf("documents")); |
| assertEquals("Tokens", 1669, kc.numberOf("tokens")); |
| assertEquals("Sentences", 188, kc.numberOf("sentences")); |
| assertEquals("Paragraphs", 130, kc.numberOf("paragraphs")); |
| |
| // System.err.println(kc.toString()); |
| // Test collectionbuilder simplifier! |
| /* |
| OrGroup( |
| AndGroup( |
| corpusID:WPD |
| textClass:reisen |
| textClass:freizeit-unterhaltung |
| textClass:kultur |
| corpusID:WPD |
| ) |
| textClass:uninteresting |
| textClass:wissenschaft |
| ) |
| */ |
| |
| assertTrue(ki.delDocs("textClass", "wissenschaft")); |
| ki.commit(); |
| |
| assertEquals("Documents", 1, kc.numberOf("documents")); |
| assertEquals("Tokens", 405, kc.numberOf("tokens")); |
| assertEquals("Sentences", 75, kc.numberOf("sentences")); |
| assertEquals("Paragraphs", 48, kc.numberOf("paragraphs")); |
| }; |
| |
| |
| @Test |
| public void filterExample2Legacy () throws Exception { |
| |
| // Construct index |
| KrillIndex ki = new KrillIndex(); |
| // Indexing test files |
| for (String i : new String[] { "00001", "00002", "00003", "00004", |
| "00005", "00006", "02439" }) { |
| ki.addDoc( |
| getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| true); |
| }; |
| ki.commit(); |
| |
| ki.addDoc(getClass() |
| .getResourceAsStream("/wiki/00012-fakemeta.json.gz"), true); |
| |
| ki.commit(); |
| |
| /* |
| CollectionBuilderLegacy kf = new CollectionBuilderLegacy(); |
| |
| // Create Virtual collections: |
| KrillCollectionLegacy kc = new KrillCollectionLegacy(ki); |
| kc.filter(kf.and("textClass", "reisen").and("textClass", |
| "freizeit-unterhaltung")); |
| */ |
| |
| KrillCollection kc = new KrillCollection(ki); |
| CollectionBuilder cb = kc.build(); |
| kc.filter(cb.andGroup().with(cb.term("textClass", "reisen")) |
| .with(cb.term("textClass", "freizeit-unterhaltung"))); |
| |
| assertEquals("Documents", 5, kc.numberOf("documents")); |
| assertEquals("Tokens", 1678, kc.numberOf("tokens")); |
| assertEquals("Sentences", 194, kc.numberOf("sentences")); |
| assertEquals("Paragraphs", 139, kc.numberOf("paragraphs")); |
| |
| |
| // Create a query |
| QueryBuilder kq = new QueryBuilder("tokens"); |
| SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery(); |
| |
| |
| Result kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true, |
| (short) 5); |
| assertEquals(kr.getTotalResults(), 369); |
| |
| // kc.filter(kf.and("corpusID", "QQQ")); |
| kc.filter(cb.term("corpusID", "QQQ")); |
| |
| assertEquals("Documents", 0, kc.numberOf("documents")); |
| assertEquals("Tokens", 0, kc.numberOf("tokens")); |
| assertEquals("Sentences", 0, kc.numberOf("sentences")); |
| assertEquals("Paragraphs", 0, kc.numberOf("paragraphs")); |
| |
| kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true, |
| (short) 5); |
| assertEquals(kr.getTotalResults(), 0); |
| }; |
| |
| |
| @Test |
| public void uidCollectionLegacy () throws IOException { |
| |
| // Construct index |
| KrillIndex ki = new KrillIndex(); |
| // Indexing test files |
| int uid = 1; |
| for (String i : new String[] { "00001", "00002", "00003", "00004", |
| "00005", "00006", "02439" }) { |
| FieldDocument fd = ki.addDoc(uid++, |
| getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| true); |
| }; |
| ki.commit(); |
| |
| assertEquals("Documents", 7, ki.numberOf("documents")); |
| assertEquals("Paragraphs", 174, ki.numberOf("paragraphs")); |
| assertEquals("Sentences", 281, ki.numberOf("sentences")); |
| assertEquals("Tokens", 2661, ki.numberOf("tokens")); |
| |
| SpanQuery sq = new SpanTermQuery(new Term("tokens", "s:der")); |
| Result kr = ki.search(sq, (short) 10); |
| assertEquals(86, kr.getTotalResults()); |
| |
| // Create Virtual collections: |
| KrillCollection kc = new KrillCollection(); |
| kc.filterUIDs(new String[] { "2", "3", "4" }); |
| kc.setIndex(ki); |
| assertEquals("Documents", 3, kc.numberOf("documents")); |
| |
| assertEquals("Paragraphs", 46, kc.numberOf("paragraphs")); |
| assertEquals("Sentences", 103, kc.numberOf("sentences")); |
| assertEquals("Tokens", 1229, kc.numberOf("tokens")); |
| |
| kr = ki.search(kc, sq, 0, (short) 20, true, (short) 5, true, (short) 5); |
| |
| assertEquals((long) 39, kr.getTotalResults()); |
| }; |
| |
| |
| @Test |
| public void uidCollectionWithDeletions () throws IOException { |
| |
| // Construct index |
| KrillIndex ki = new KrillIndex(); |
| // Indexing test files |
| int uid = 1; |
| for (String i : new String[] { "00001", "00002", "00003", "00004", |
| "00005", "00006", "02439" }) { |
| FieldDocument fd = ki.addDoc(uid++, |
| getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), |
| true); |
| }; |
| ki.commit(); |
| |
| |
| assertEquals("Documents", 7, ki.numberOf("documents")); |
| assertEquals("Paragraphs", 174, ki.numberOf("paragraphs")); |
| assertEquals("Sentences", 281, ki.numberOf("sentences")); |
| assertEquals("Tokens", 2661, ki.numberOf("tokens")); |
| |
| assertTrue(ki.delDoc(3)); |
| ki.commit(); |
| |
| assertEquals("Documents", 6, ki.numberOf("documents")); |
| |
| assertEquals("Paragraphs", 146, ki.numberOf("paragraphs")); |
| assertEquals("Sentences", 212, ki.numberOf("sentences")); |
| assertEquals("Tokens", 2019, ki.numberOf("tokens")); |
| |
| assertTrue(ki.delDoc(2)); |
| assertTrue(ki.delDoc(3)); |
| assertTrue(ki.delDoc(4)); |
| assertTrue(ki.delDoc(5)); |
| assertTrue(ki.delDoc(6)); |
| assertTrue(ki.delDoc(7)); |
| ki.commit(); |
| |
| assertEquals("Documents", 1, ki.numberOf("documents")); |
| assertEquals("Paragraphs", 75, ki.numberOf("paragraphs")); |
| }; |
| |
| |
| private FieldDocument createDoc1 () { |
| FieldDocument fd = new FieldDocument(); |
| fd.addString("ID", "doc-1"); |
| fd.addString("author", "Frank"); |
| fd.addKeyword("textClass", "Nachricht Kultur Reisen"); |
| fd.addInt("pubDate", 20051210); |
| fd.addText("text", "Der alte Mann ging über die Straße"); |
| return fd; |
| }; |
| |
| |
| private FieldDocument createDoc2 () { |
| FieldDocument fd = new FieldDocument(); |
| fd.addString("ID", "doc-2"); |
| fd.addString("author", "Peter"); |
| fd.addKeyword("textClass", "Kultur Reisen"); |
| fd.addInt("pubDate", 20051207); |
| fd.addText("text", "Der junge Mann hatte keine andere Wahl"); |
| return fd; |
| }; |
| |
| |
| private FieldDocument createDoc3 () { |
| FieldDocument fd = new FieldDocument(); |
| fd.addString("ID", "doc-3"); |
| fd.addString("author", "Sebastian"); |
| fd.addKeyword("textClass", "Reisen Finanzen"); |
| fd.addInt("pubDate", 20051216); |
| fd.addText("text", "Die Frau und der Mann küssten sich"); |
| return fd; |
| }; |
| }; |