DIRTY DEMO HACKS
diff --git a/src/main/java/de/ids_mannheim/korap/KorapCollection.java b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
index b2b47b8..6c33c7c 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
@@ -117,7 +117,7 @@
log.trace("Added extension: {}", filter.toString());
this.filter.add(
new FilterOperation(
- (Filter) new QueryWrapperFilter(filter.toQuery()),
+ (Filter) new QueryWrapperFilter(filter.toQuery()),
true
)
);
@@ -142,9 +142,6 @@
public FixedBitSet bits (AtomicReaderContext atomic) throws IOException {
/*
- TODO:
- Don't check the live docs in advance - combine them afterwards with an "and" operation,
- so before this you can fully use "and" and "or" on an empty bitset.
Use Bits.MatchAllBits(int len)
*/
@@ -159,13 +156,14 @@
FilterOperation kcInit = filters.remove(0);
log.trace("FILTER: {}", kcInit);
-
// Init vector
DocIdSet docids = kcInit.filter.getDocIdSet(atomic, null);
+
DocIdSetIterator filterIter = docids.iterator();
if (filterIter != null) {
log.trace("InitFilter has effect");
+ // System.err.println("Init has an effect");
bitset.or(filterIter);
noDoc = false;
};
@@ -183,11 +181,18 @@
if (kc.isFilter()) {
bitset.clear(0, bitset.length());
noDoc = true;
+ }
+ else {
+ // System.err.println("No term found");
};
continue;
};
if (kc.isExtension()) {
+ // System.err.println("Term found!");
+ // log.trace("Extend filter");
+ // System.err.println("Old Card:" + bitset.cardinality());
bitset.or(filterIter);
+ // System.err.println("New Card:" + bitset.cardinality());
}
else {
bitset.and(filterIter);
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index 2c5ae31..3f93cd1 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -332,8 +332,13 @@
};
long docCount = 0;
+ // System.err.println("CHECK");
+ int i = 1;
for (AtomicReaderContext atomic : this.reader().leaves()) {
+ // System.err.println("READER" + i + "a-" + docCount);
docCount += collection.bits(atomic).cardinality();
+ // System.err.println("READER" + i + "b-" + docCount);
+ i++;
};
return docCount;
};
diff --git a/src/main/java/de/ids_mannheim/korap/KorapQuery.java b/src/main/java/de/ids_mannheim/korap/KorapQuery.java
index b76b7b8..0ee3895 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapQuery.java
@@ -171,12 +171,18 @@
};
value = value.replace("'", "").replace("\"", "");
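+ // Temporary demo hack: rewrites every "_" in the value to "/" before it is passed to re() - this also hits any "_" inside the term itself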
+ value = value.replace("_", "/");
+
return this.seg(this.re(value));
};
if (!value.matches("[^:]+?:.+"))
value = "s:" + value;
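+ // Temporary demo hack: rewrites every "_" in the value to "/" (e.g. "base_p:foo" becomes "base/p:foo") - this also hits any "_" inside the term itself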
+ value = value.replace("_", "/");
+
return this.seg(value);
case "!=":
diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties
index a4c83b7..e050a57 100644
--- a/src/main/resources/log4j.properties
+++ b/src/main/resources/log4j.properties
@@ -11,7 +11,7 @@
#log4j.logger.de.ids_mannheim.korap.query.spans.MatchSpans = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.KorapIndex = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.KorapMatch = TRACE, stdout
-#log4j.logger.de.ids_mannheim.korap.KorapCollection = TRACE, stdout
+# log4j.logger.de.ids_mannheim.korap.KorapCollection = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.index.PositionsToOffset = TRACE, stdout
# log4j.logger.de.ids_mannheim.korap.analysis.MultiTermTokenStream = TRACE, stdout
diff --git a/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java b/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java
index bcd9ed3..54864aa 100644
--- a/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java
+++ b/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java
@@ -70,6 +70,7 @@
assertEquals("Documents", 1, kc.numberOf("documents"));
kc.extend( kf.and("textClass", "wissenschaft") );
+
assertEquals("Documents", 3, kc.numberOf("documents"));
assertEquals("Tokens", 1669, kc.numberOf("tokens"));
assertEquals("Sentences", 188, kc.numberOf("sentences"));
@@ -77,6 +78,78 @@
// System.err.println(kr.toJSON());
};
+ @Test
+ @Ignore
+ public void filterExampleAtomic () throws IOException {
+ // Same scenario as the preceding filter test, but committing after every document (intended to yield multiple atomic indices)
+
+ // Construct index
+ KorapIndex ki = new KorapIndex();
+ // Index the test files, committing after each one
+ for (String docId : new String[] {"00001", "00002", "00003", "00004", "00005", "00006", "02439"}) {
+ ki.addDocFile(
+ getClass().getResource("/wiki/" + docId + ".json.gz").getFile(), true
+ );
+ ki.commit();
+ };
+
+ KorapFilter kf = new KorapFilter();
+
+ // Create Virtual collections:
+ KorapCollection kc = new KorapCollection(ki);
+
+ assertEquals("Documents", 7, kc.numberOf("documents"));
+
+ /*
+ If this is set - everything is fine automatically ...
+ kc.filter(kf.and("corpusID", "WPD"));
+ assertEquals("Documents", 7, kc.numberOf("documents"));
+ */
+
+ // The virtual collection consists of all documents that have the textClass "reisen" and "freizeit-unterhaltung"
+ kc.filter( kf.and("textClass", "reisen").and("textClass", "freizeit-unterhaltung") );
+
+ assertEquals("Documents", 5, kc.numberOf("documents"));
+ assertEquals("Tokens", 1678, kc.numberOf("tokens"));
+ assertEquals("Sentences", 194, kc.numberOf("sentences"));
+ assertEquals("Paragraphs", 139, kc.numberOf("paragraphs"));
+
+ // Subset this to all documents that also have the textClass "kultur"
+ kc.filter(kf.and("textClass", "kultur"));
+
+ assertEquals("Documents", 1, kc.numberOf("documents"));
+ assertEquals("Tokens", 405, kc.numberOf("tokens"));
+ assertEquals("Sentences", 75, kc.numberOf("sentences"));
+ assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
+
+ kc.filter(kf.and("corpusID", "WPD"));
+
+ assertEquals("Documents", 1, kc.numberOf("documents"));
+ assertEquals("Tokens", 405, kc.numberOf("tokens"));
+ assertEquals("Sentences", 75, kc.numberOf("sentences"));
+ assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
+
+ // Create a query
+ KorapQuery kq = new KorapQuery("tokens");
+ SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery();
+
+ KorapResult kr = kc.search(query);
+ assertEquals(70, kr.totalResults());
+
+ // The assertion below expects that extending by "uninteresting" leaves the document count at 1
+ kc.extend( kf.and("textClass", "uninteresting") );
+ assertEquals("Documents", 1, kc.numberOf("documents"));
+
+ kc.extend( kf.and("textClass", "wissenschaft") );
+
+ assertEquals("Documents", 3, kc.numberOf("documents"));
+ assertEquals("Tokens", 1669, kc.numberOf("tokens"));
+ assertEquals("Sentences", 188, kc.numberOf("sentences"));
+ assertEquals("Paragraphs", 130, kc.numberOf("paragraphs"));
+ // System.err.println(kr.toJSON());
+ };
+
+
@Test
public void filterExample2 () throws IOException {
diff --git a/src/test/java/de/ids_mannheim/korap/query/TestKorapQueryJSON.java b/src/test/java/de/ids_mannheim/korap/query/TestKorapQueryJSON.java
index c995068..c62a71a 100644
--- a/src/test/java/de/ids_mannheim/korap/query/TestKorapQueryJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/query/TestKorapQueryJSON.java
@@ -172,6 +172,14 @@
assertEquals(sqwi.toQuery().toString(), "spanWithin(<tokens:np />, tokens:"+defaultFoundry+"l:Mann)");
};
+ @Test
+ public void queryJSONDemo () throws QueryException {
+ // Demo hack check: the "_" in the term value "base_p:foo" is expected to be serialized as "/"
+ SpanQueryWrapperInterface sqwi = new KorapQuery("tokens").fromJSON("{ \"query\" : { \"@type\" : \"korap:token\", \"@value\" : { \"@type\" : \"korap:term\", \"@value\" : \"base_p:foo\", \"relation\" : \"=\" }}}");
+ assertEquals("tokens:base/p:foo", sqwi.toQuery().toString());
+ };
+
+
public static String getString (String path) {
StringBuilder contentBuilder = new StringBuilder();
try {
diff --git a/src/test/resources/wiki/readme.txt b/src/test/resources/wiki/readme.txt
index 32fc818..6f60d05 100644
--- a/src/test/resources/wiki/readme.txt
+++ b/src/test/resources/wiki/readme.txt
@@ -3,5 +3,5 @@
00003: kultur,musik
00004: wissenschaft,populaerwissenschaft
00005: freizeit-unterhaltung,reisen
-00006: freizeit-unterhaltung,reisen
-02439: kultur,musik,freizeit-unterhaltung,reisen
+00006: freizeit-unterhaltung,reisen (WPD)
+02439: kultur,musik,freizeit-unterhaltung,reisen (WPD)