Described snippet building algorithm and fixed repetition constructor
Change-Id: Id7a38c611645e685b92274ddba77f3462aad9a39
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanRepetitionQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanRepetitionQueryWrapper.java
index 808b064..ec88b04 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanRepetitionQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanRepetitionQueryWrapper.java
@@ -27,10 +27,12 @@
// This is for exact enumbered repetition, like in a{3}
public SpanRepetitionQueryWrapper (SpanQueryWrapper subquery, int exact) {
+
if (!subquery.isEmpty()) {
this.subquery = subquery;
if (subquery.maybeUnsorted())
this.maybeUnsorted = true;
+ this.isEmpty = false;
}
else
this.isEmpty = true;
@@ -43,6 +45,7 @@
return;
};
+ this.isNull = false;
this.min = exact;
this.max = exact;
};
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java
index a1edeef..b457d77 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSequenceQueryWrapper.java
@@ -675,7 +675,6 @@
// No real sequence - only one element
if (size == 1) {
-
// But the element may be expanded
if (this.segments.get(0).isExtended()
&& (this.hasConstraints() || !this.isInOrder())) {
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 5714bf3..890b2a8 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -34,13 +34,38 @@
import de.ids_mannheim.korap.response.match.Relation;
/*
- Todo: The implemented classes and private names are horrible!
- Refactor, future-me!
+ * The snippet building algorithm is quite complicated for now
+ * and should probably be refactored.
+ * It works like this:
+ *
+ * 1. For all spans and highlights, pagebreaks etc. all necessary
+ * positions are collected (processHighlight)
+ * 2. For all collected positions the character offsets are retrieved.
+ * Based on these offsets, a list of arrays is created for all spans
+ * and highlights, each array having the structure
+ * [startchar, endchar, highlightClass] (processHighlightSpans)
+ * 2.1 The primary data and optional context information is retrieved
+ * (processOffsetChars)
+ * 3. Based on the collected spans, two lists are created for opening
+ * and closing tags (pretty much clones of the initial span list),
+ * sorted by opening and closing position respectively, and processed
+ * in parallel to form an open/close stack. Each stack entry has the
+ * structure [startchar, endchar, highlightClass, open=1/close=0]
+ * (processHighlightStack)
+ * 3.1 If the element is a relation with an identifier, it may
+ * be removed if it is a duplicate (filterMultipleIdentifiers)
+ * 4. Based on the stack and the primary data the snippet is created.
+ * (processHighlightSnippet)
+ */
- The number based Highlighttype is ugly - UGLY!
-
- substrings may be out of range - e.g. if snippets are not lifted!
-*/
+/*
+ * Todo: The implemented classes and private names are horrible!
+ * Refactor, future-me!
+ *
+ * The number based Highlighttype is ugly - UGLY!
+ *
+ * substrings may be out of range - e.g. if snippets are not lifted!
+ */
/**
* Representation of Matches in a Result.
@@ -406,8 +431,8 @@
};
- public void addPagebreak (int start, int number) {
- this.addHighlight(new Highlight(start, number));
+ public void addPagebreak (int start, int pagenumber) {
+ this.addHighlight(new Highlight(start, pagenumber));
};
/**
@@ -763,8 +788,8 @@
if (DEBUG)
log.debug("Add pagebreak to list");
- charOffset = bb.getInt();
pagenumber = bb.getInt();
+ charOffset = bb.getInt();
// This is the first pagebreak!
pagebreaks.add(new int[]{charOffset, pagenumber});
@@ -781,8 +806,8 @@
bb.put(b);
bb.rewind();
- charOffset = bb.getInt();
pagenumber = bb.getInt();
+ charOffset = bb.getInt();
// This is the first pagebreak!
pagebreaks.add(new int[]{charOffset, pagenumber});
@@ -804,9 +829,9 @@
};
if (pagebreaks.size() > 0) {
- this.startPage = pagebreaks.get(0)[0];
+ this.startPage = pagebreaks.get(0)[1];
if (pagebreaks.size() > 1 && pagebreaks.get(pagebreaks.size()-1) != null)
- this.endPage = pagebreaks.get(pagebreaks.size()-1)[0];
+ this.endPage = pagebreaks.get(pagebreaks.size()-1)[1];
}
return pagebreaks;
@@ -1327,6 +1352,7 @@
// check if the opener is smaller than the closener
if (openList.peekFirst()[0] < closeList.peekFirst()[1]) {
+
int[] e = openList.removeFirst().clone();
// Mark as opener
@@ -1335,8 +1361,7 @@
// Add opener to stack
stack.add(e);
}
- else {
-
+ else {
// Add closener to stack
stack.add(closeList.removeFirst());
};
@@ -1417,8 +1442,11 @@
int startOffsetChar = startPosChar - intArray[0];
// Add match span, in case no inner match is defined
- if (this.innerMatchEndPos == -1)
+ if (this.innerMatchEndPos == -1) {
+ if (DEBUG)
+ log.debug("Added array to span with {} (1)", intArray);
this.span.add(intArray);
+ };
// highlights
// -- I'm not sure about this.
@@ -1467,6 +1495,9 @@
0 // Dummy value for later use
};
+ if (DEBUG)
+ log.debug("Added array to span with {} (2)", intArray);
+
this.span.add(intArray);
};
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
index ca4905b..dae3aa9 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
@@ -23,6 +23,8 @@
import de.ids_mannheim.korap.query.SpanFocusQuery;
import de.ids_mannheim.korap.query.SpanNextQuery;
import de.ids_mannheim.korap.query.SpanWithinQuery;
+import de.ids_mannheim.korap.query.QueryBuilder;
+import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
import de.ids_mannheim.korap.response.Match;
import de.ids_mannheim.korap.response.Result;
import de.ids_mannheim.korap.response.SearchContext;
@@ -35,7 +37,7 @@
public class TestPagebreakIndex {
@Test
- public void indexExample1 () throws IOException {
+ public void indexExample1 () throws Exception {
KrillIndex ki = new KrillIndex();
// abcabcabac
@@ -55,15 +57,14 @@
ki.commit();
SpanQuery sq = new SpanTermQuery(new Term("tokens", "s:c"));
-
Result kr = ki.search(sq, (short) 10);
-
+
assertEquals(528, kr.getMatch(0).getStartPage());
assertEquals(-1, kr.getMatch(0).getEndPage());
assertEquals(
"snippetHTML",
"<span class=\"context-left\">"+
- "<span class=\"pb\" data-after=\"528\"></span>"+
+ // "<span class=\"pb\" data-after=\"528\"></span>"+
"ab"+
"</span>"+
"<span class=\"match\">"+
@@ -73,13 +74,25 @@
"</span>"+
"<span class=\"context-right\">"+
"ab"+
- "<span class=\"pb\" data-after=\"528\"></span>"+
+ // "<span class=\"pb\" data-after=\"528\"></span>"+
"cab"+
- "<span class=\"pb\" data-after=\"528\"></span>"+
+ // "<span class=\"pb\" data-after=\"528\"></span>"+
"a"+
"<span class=\"more\">"+
"</span>"+
"</span>",
kr.getMatch(0).getSnippetHTML());
+
+
+ QueryBuilder qb = new QueryBuilder("tokens");
+ sq = qb.seq().append(
+ qb.repeat(
+ qb.seq().append(qb.seg("s:a")).append(qb.seg("s:b")).append(qb.seg("s:c")),
+ 2
+ )
+ ).append(qb.seg("s:a"))
+ .toQuery();
+
+ assertEquals(sq.toString(), "spanNext(spanRepetition(spanNext(spanNext(tokens:s:a, tokens:s:b), tokens:s:c){2,2}), tokens:s:a)");
};
};