Support general inline markers (instead of just pagebreaks) (fixes #132)
Change-Id: If7318080d5195387cd414e1741d87d032c817d7a
diff --git a/Changes b/Changes
index 601eb41..6c66f00 100644
--- a/Changes
+++ b/Changes
@@ -1,7 +1,8 @@
-0.62.4 2024-05-22
+0.62.4 2024-05-27
- [feature] Make match and context size configurable (address #128,
diewald & margaretha)
- [enhancement] Separate max length for token and char context (margaretha)
+ - [feature] Support for inline markers (fixes #132, diewald)
0.62.3 2024-04-16
- [cleanup] Added getDocBitsSupplier to VirtualCorpusFilter (margaretha)
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index d1e3cde..3f68608 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -1584,7 +1584,8 @@
// Add snippet if existing
if (snippets) {
match.setContext(kr.getContext());
- match.retrievePagebreaks("~:base/s:pb");
+ match.retrieveMarkers("~:base/s:pb");
+ match.retrieveMarkers("~:base/s:marker");
if (DEBUG)
log.trace("Retrieve pagebreaks from index");
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java b/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
index ae49892..d06b00c 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
@@ -1,10 +1,9 @@
package de.ids_mannheim.korap.index;
-import static de.ids_mannheim.korap.util.KrillArray.*;
import de.ids_mannheim.korap.util.CorpusDataException;
import org.apache.lucene.util.BytesRef;
import java.nio.ByteBuffer;
-import java.util.*;
+import java.nio.charset.StandardCharsets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -51,7 +50,7 @@
private boolean storeOffsets = false;
public BytesRef payload = null;
- private static ByteBuffer bb = ByteBuffer.allocate(8);
+ private static ByteBuffer bb = ByteBuffer.allocate(64);
private static String[] stringOffset;
private static short i, l;
@@ -403,7 +402,6 @@
try {
for (i = 1; i < pls.length;) {
-
// Resize the bytebuffer
if ((bb.capacity() - l) < 8) {
bb = ByteBuffer.allocate(bb.capacity() + 8)
@@ -428,6 +426,20 @@
bb.putLong(Long.parseLong(pls[i]));
l += 8;
break;
+ case "<x>": // bytes
+
+ byte[] data = pls[i].getBytes(StandardCharsets.UTF_8);
+
+ if ((bb.capacity() - l) < (data.length + 4)) {
+ bb = ByteBuffer.allocate(bb.capacity() + data.length + 4)
+ .put(bb.array());
+ bb.position(l);
+ };
+
+ bb.putInt(data.length);
+ bb.put(data);
+ l += data.length + 4;
+ break;
};
i += 2;
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java b/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
index 50e39aa..f777184 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
@@ -56,7 +56,7 @@
private int mttIndex = 0, mtIndex = 0;
private short i = 0;
- private ByteBuffer payload = ByteBuffer.allocate(36);
+ private ByteBuffer payload = ByteBuffer.allocate(512);
/**
diff --git a/src/main/java/de/ids_mannheim/korap/index/TermInfo.java b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
index efb5535..f6eecdc 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
@@ -97,6 +97,12 @@
tterm = tterm.substring(2);
break;
+ case '~':
+ this.type = "mark";
+ ttype = 5;
+ tterm = tterm.substring(2);
+ break;
+
default:
// term
this.type = "term";
@@ -131,6 +137,18 @@
};
}
+ else if (ttype == 5) {
+ pti = this.payload.get(); // Ignore PTI - temporary!!!
+ this.value = tterm;
+
+ // This also means that the startchar may contain
+ // the position (aka page number) of the marker
+ this.startChar = this.payload.getInt();
+ this.endChar = this.payload.getInt();
+
+ // NOTE(review): the payload may additionally carry <x> byte data, which is not read here - confirm
+ }
+
// for positions (aka offset tokens)
else {
this.value = tterm;
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 5bb6ad1..7d91745 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -13,6 +13,8 @@
import java.util.LinkedList;
import java.util.List;
+import java.nio.charset.StandardCharsets;
+
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
@@ -92,12 +94,13 @@
// Logger
private final static Logger log = LoggerFactory.getLogger(Match.class);
-
+
// end marker of highlights that are pagebreaks
private static final int PB_MARKER = -99999;
+ private static final int ALL_MARKER = -99998;
// Textual elements that are in context
- private static final int CONTEXT = -99998;
+ private static final int CONTEXT = -99997;
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
@@ -237,7 +240,7 @@
/**
* Private class of highlights.
* TODO: This should probably be renamed, as it not only contains highlights
- * but also annotations, pagebreaks and relations
+ * but also annotations, markers, pagebreaks and relations
*/
private class Highlight {
public int start, end;
@@ -285,6 +288,19 @@
this.end = PB_MARKER;
this.number = pagenumber;
};
+
+ // Marker
+ public Highlight (int start, String marker) {
+ this.start = start;
+ this.end = ALL_MARKER;
+
+ // TODO: This can overflow - once 2048 numbers are used, the marker is not registered!
+ if (annotationNumberCounter < 2048) {
+ this.number = annotationNumberCounter++;
+ annotationNumber.put(this.number, marker);
+ };
+ };
+
};
@@ -513,6 +529,11 @@
this.addHighlight(new Highlight(start, pagenumber));
};
+ public void addMarker (int start, String data) {
+ this.addHighlight(new Highlight(start, data));
+ };
+
+
/**
* Get document id.
*/
@@ -562,7 +583,7 @@
// Iterate over highlights to find matching class
for (Highlight h : this.highlight) {
- if (h.number == number && h.end != PB_MARKER)
+ if (h.number == number && h.end != PB_MARKER && h.end != ALL_MARKER)
return h.start;
};
@@ -734,7 +755,7 @@
// There are highlights to integrate
if (this.highlight != null) {
for (Highlight h : this.highlight) {
- if (h.number >= 256 || h.end == PB_MARKER)
+ if (h.number >= 256 || h.end == PB_MARKER || h.end == ALL_MARKER)
continue;
// Add highlight to the snippet
@@ -767,7 +788,11 @@
@JsonIgnore
public String getPosID (String pos) {
- String[] startEnd = pos.split("-");
+ if (pos == null) {
+ return "";
+ };
+
+ String[] startEnd = pos.split("-");
if (startEnd.length == 2) {
return this.getPosID(
Integer.parseInt(startEnd[0]),
@@ -845,35 +870,35 @@
};
- // Retrieve pagebreaks in a certain area
- public List<int[]> retrievePagebreaks (String pb) {
+ // Retrieve markers in a certain area
+ public List<int[]> retrieveMarkers (String marker) {
if (this.positionsToOffset != null) {
- return this.retrievePagebreaks(
+ return this.retrieveMarkers(
this.positionsToOffset.getLeafReader(),
(Bits) null,
"tokens",
- pb
+ marker
);
};
return null;
};
- // Retrieve pagebreaks in a certain area
+ // Retrieve markers in a certain area
// THIS IS NOT VERY CLEVER - MAKE IT MORE CLEVER!
- public List<int[]> retrievePagebreaks (LeafReaderContext atomic,
+ public List<int[]> retrieveMarkers (LeafReaderContext atomic,
Bits bitset,
String field,
- String pb) {
+ String marker) {
- // List of relevant pagebreaks
+ // List of relevant pagebreaks - only used for pagebreak markers!
List<int[]> pagebreaks = new ArrayList<>(24);
int charOffset = 0, pagenumber = 0, start = 0;
if (DEBUG) {
log.debug("=================================");
- log.debug("Retrieve pagebreaks between {}-{}",
+ log.debug("Retrieve markers between {}-{}",
this.getStartPos(),
this.getEndPos());
};
@@ -881,37 +906,37 @@
try {
// Store character offsets in ByteBuffer
- ByteBuffer bb = ByteBuffer.allocate(16);
+ ByteBuffer bb = ByteBuffer.allocate(256);
- // Store last relevant pagebreak in byte array
+ // Store last relevant marker in byte array
byte[] b = null;
- SpanTermQuery stq = new SpanTermQuery(new Term(field, pb));
+ SpanTermQuery stq = new SpanTermQuery(new Term(field, marker));
if (DEBUG)
- log.trace("Check pagebreaks with {}", stq.toString());
+ log.trace("Check markers with {}", stq.toString());
- Spans pagebreakSpans = stq.getSpans(
+ Spans markerSpans = stq.getSpans(
atomic, bitset, new HashMap<Term, TermContext>()
);
// Iterate over all pagebreaks
- while (pagebreakSpans.next() == true) {
+ while (markerSpans.next() == true) {
if (DEBUG) {
log.debug("There is a pagebreak at {}/{} and we are at {}",
- pagebreakSpans.doc(),
- pagebreakSpans.start(),
+ markerSpans.doc(),
+ markerSpans.start(),
this.localDocID);
};
// Current pagebreak is not in the correct document
- if (pagebreakSpans.doc() != this.localDocID) {
- if (pagebreakSpans.doc() < this.localDocID) {
- pagebreakSpans.skipTo(this.localDocID);
+ if (markerSpans.doc() != this.localDocID) {
+ if (markerSpans.doc() < this.localDocID) {
+ markerSpans.skipTo(this.localDocID);
// No pagebreaks in this document
- if (pagebreakSpans.doc() != this.localDocID)
+ if (markerSpans.doc() != this.localDocID)
break;
}
else {
@@ -921,19 +946,19 @@
};
if (DEBUG)
- log.debug("The pagebreak occurs in the document");
-
- // There is a pagebreak found - check,
+ log.debug("The marker occurs in the document");
+
+ // There is a marker found - check,
// if it is in the correct area
- if (pagebreakSpans.start() <= this.getStartPos()) {
+ if (markerSpans.start() < this.getStartPos()) {
// Only the first payload is relevant
- b = pagebreakSpans.getPayload().iterator().next();
- start = pagebreakSpans.start();
+ b = markerSpans.getPayload().iterator().next();
+ start = markerSpans.start();
if (DEBUG)
- log.debug("PB start position is before match at {}:{}",
- pagebreakSpans.start(),
+ log.debug("Marker start position is before match at {}:{}",
+ markerSpans.start(),
b);
}
@@ -941,48 +966,85 @@
// This is the first pagebreak inside the match!
else {
- // b is already defined!
+ // b is already defined!
+ // It may have been remembered during a previous next() iteration
if (b != null) {
bb.rewind();
bb.put(b);
bb.rewind();
pagenumber = bb.getInt();
- charOffset = bb.getInt();
+ charOffset = bb.getInt();
- if (DEBUG)
- log.debug("Add pagebreak to list: {}-{}", charOffset, pagenumber);
+ // This marker is a pagebreak
+ if (pagenumber != 0) {
+ if (DEBUG)
+ log.debug("Add pagebreak to list: {}-{}", charOffset, pagenumber);
- // This is the first pagebreak!
- pagebreaks.add(new int[]{charOffset, pagenumber});
+ // This is the first pagebreak!
+ pagebreaks.add(new int[]{charOffset, pagenumber});
- if (start >= this.getStartPos()) {
+ if (start >= this.getStartPos()) {
+ if (DEBUG)
+ log.debug("Add marker to rendering: {}-{}",
+ charOffset,
+ pagenumber);
+ this.addPagebreak(charOffset, pagenumber);
+ };
- if (DEBUG)
- log.debug("Add pagebreak to rendering: {}-{}",
- charOffset,
- pagenumber);
- this.addPagebreak(charOffset, pagenumber);
- };
+ // This marker is no pagebreak
+ } else {
+ int bytelength = bb.getInt();
+ byte[] anno = new byte[bytelength];
+ bb.get(anno, 0, bytelength);
+ String annoStr = new String(anno, StandardCharsets.UTF_8);
+ this.addMarker(charOffset, annoStr);
+ }
+
b = null;
}
// b wasn't used yet
- if (pagebreakSpans.start() <= this.getEndPos()) {
+ if (markerSpans.start() <= this.getEndPos()) {
// Set new pagebreak
// Only the first payload is relevant
- b = pagebreakSpans.getPayload().iterator().next();
+ b = markerSpans.getPayload().iterator().next();
bb.rewind();
bb.put(b);
bb.rewind();
pagenumber = bb.getInt();
charOffset = bb.getInt();
+
+ // This marker is a pagebreak
+ if (pagenumber != 0) {
+ if (DEBUG)
+ log.debug("Add pagebreak to list: {}-{}", charOffset, pagenumber);
- // This is the first pagebreak!
- pagebreaks.add(new int[]{charOffset, pagenumber});
- this.addPagebreak(charOffset,pagenumber);
+ // This is the first pagebreak!
+ pagebreaks.add(new int[]{charOffset, pagenumber});
+
+ if (start >= this.getStartPos()) {
+ if (DEBUG)
+ log.debug("Add pagebreak to rendering: {}-{}",
+ charOffset,
+ pagenumber);
+ this.addPagebreak(charOffset, pagenumber);
+ };
+
+
+ }
+ // This marker is no pagebreak
+ else {
+ int bytelength = bb.getInt();
+
+ byte[] anno = new byte[bytelength];
+ bb.get(anno);
+ String annoStr = new String(anno, StandardCharsets.UTF_8);
+ this.addMarker(charOffset, annoStr);
+ }
+
b = null;
}
@@ -993,6 +1055,7 @@
};
};
+ // Handle a still remembered marker - same handling as inside the loop above, runs at most once
if (b != null) {
bb.rewind();
bb.put(b);
@@ -1001,21 +1064,34 @@
pagenumber = bb.getInt();
charOffset = bb.getInt();
- if (DEBUG)
- log.debug("Add pagebreak to list: {}-{}", charOffset, pagenumber);
-
- // This is a remembered pagebreak!
- pagebreaks.add(new int[]{charOffset, pagenumber});
+ // This marker is a pagebreak
+ if (pagenumber != 0) {
- if (start >= this.getStartPos()) {
-
if (DEBUG)
- log.debug("Add pagebreak to rendering: {}-{}",
- charOffset,
- pagenumber);
- this.addPagebreak(charOffset, pagenumber);
- };
+ log.debug("Add pagebreak to list: {}-{}", charOffset, pagenumber);
+
+ // This is a remembered pagebreak!
+ pagebreaks.add(new int[]{charOffset, pagenumber});
+ if (start >= this.getStartPos()) {
+
+ if (DEBUG)
+ log.debug("Add pagebreak to rendering: {}-{}",
+ charOffset,
+ pagenumber);
+ this.addPagebreak(charOffset, pagenumber);
+ };
+ }
+ // This marker is no pagebreak
+ else {
+ int bytelength = bb.getInt();
+
+ byte[] anno = new byte[bytelength];
+ bb.get(anno);
+ String annoStr = new String(anno, StandardCharsets.UTF_8);
+ this.addMarker(charOffset, annoStr);
+ }
+
b = null;
};
}
@@ -1218,7 +1294,7 @@
&& hl.end <= this.getEndPos()) {
// Highlight is no pagebreak
- if (hl.end != PB_MARKER) {
+ if (hl.end != PB_MARKER && hl.end != ALL_MARKER) {
pto.add(this.localDocID, hl.start);
pto.add(this.localDocID, hl.end);
@@ -1381,13 +1457,19 @@
snippetArray.addClose(element[2]);
}
- // empty tag
+ // empty tag (pagebreak)
else if (element[3] == 2) {
// Add Empty (pagebreak)
snippetArray.addEmpty(element[2]);
- }
-
+ }
+
+ // empty tag (marker)
+ else if (element[3] == 3) {
+
+ // Add Empty (marker)
+ snippetArray.addMarker(element[2]);
+ }
// open tag
else {
@@ -1555,7 +1637,7 @@
continue;
// Highlight is a pagebreak
- if (highlight.end == PB_MARKER)
+ if (highlight.end == PB_MARKER || highlight.end == ALL_MARKER)
continue;
if (classes == null)
@@ -1769,7 +1851,9 @@
if (DEBUG)
log.debug("No more open tags -- close all non pagebreaks");
- if (closeList.peekFirst()[1] != PB_MARKER) {
+ int pf = closeList.peekFirst()[1];
+
+ if (pf != PB_MARKER && pf != ALL_MARKER) {
stack.add(closeList.removeFirst());
}
else if (DEBUG) {
@@ -1785,26 +1869,34 @@
break;
};
- // Closener is pagebreak
- if (closeList.peekFirst()[1] == PB_MARKER) {
+ int clpf = closeList.peekFirst()[1];
+ int olpf = openList.peekFirst()[1];
+
+ // Closener is pagebreak or marker
+ if (clpf == PB_MARKER || clpf == ALL_MARKER) {
if (DEBUG)
- log.debug("Close is pagebreak -- ignore (2)");
+ log.debug("Close is pagebreak or a marker -- ignore (2)");
// Remove closing pagebreak
closeList.removeFirst();
}
- // Opener is pagebreak
- else if (openList.peekFirst()[1] == PB_MARKER) {
+ // Opener is pagebreak or marker
+ else if (olpf == PB_MARKER || olpf == ALL_MARKER) {
int[] e = openList.removeFirst().clone();
if (DEBUG)
- log.debug("Open is pagebreak");
+ log.debug("Open is pagebreak or a marker");
// Mark as empty
e[1] = e[0]; // Remove pagebreak marker
- e[3] = 2;
+
+ if (olpf == PB_MARKER) {
+ e[3] = 2;
+ } else {
+ e[3] = 3;
+ };
// Add empty pagebreak
stack.add(e);
@@ -1958,7 +2050,7 @@
int end = -1;
// Highlight is a pagebreak
- if (highlight.end != PB_MARKER) {
+ if (highlight.end != PB_MARKER && highlight.end != ALL_MARKER) {
start = this.positionsToOffset.start(ldid, highlight.start);
end = this.positionsToOffset.end(ldid, highlight.end);
}
@@ -1980,14 +2072,14 @@
start -= startOffsetChar;
// Keep end equal -1
- if (end != PB_MARKER) {
+ if (end != PB_MARKER && end != ALL_MARKER) {
end -= startOffsetChar;
}
else if (DEBUG) {
log.debug("Pagebreak keeps end position");
};
- if (start < 0 || (end < 0 && end != PB_MARKER))
+ if (start < 0 || (end < 0 && end != PB_MARKER && end != ALL_MARKER))
continue;
// Create intArray for highlight
diff --git a/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinator.java b/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinator.java
index 36360c5..0a72156 100644
--- a/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinator.java
+++ b/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinator.java
@@ -83,6 +83,11 @@
this.combine.add(new HighlightCombinatorElement((byte) 3, pagenumber));
};
+ // Add marker highlight to the stack
+ public void addMarker (int annonumber) {
+ this.combine.add(new HighlightCombinatorElement((byte) 4, annonumber));
+ };
+
// Add closing highlight combinator to the stack
public void addClose (int number) {
diff --git a/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java b/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java
index c39825a..d72d46b 100644
--- a/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java
+++ b/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java
@@ -2,10 +2,8 @@
import org.apache.lucene.util.FixedBitSet;
import de.ids_mannheim.korap.response.Match;
-import de.ids_mannheim.korap.response.match.Relation;
import static de.ids_mannheim.korap.util.KrillString.*;
import java.util.*;
-import java.io.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -16,13 +14,14 @@
public class HighlightCombinatorElement {
// Number -1: Match
- // Number -99998: Context
- private final static int CONTEXT = -99998;
+ // Number -99997: Context
+ private final static int CONTEXT = -99997;
// Type 0: Textual data
// Type 1: Opening
// Type 2: Closing
- // Type 3: Empty
+ // Type 3: Empty (pagebreak)
+ // Type 4: Empty (marker)
public byte type;
public int number = 0;
@@ -168,6 +167,12 @@
// Empty element
else if (this.type == 3) {
return "<span class=\"pb\" data-after=\"" + number + "\"></span>";
+ }
+
+ // Marker
+ else if (this.type == 4) {
+ String[] parts = match.getAnnotationID(this.number).split(":", 2);
+ return "<span class=\"inline-marker\" data-key=\"" + escapeHTML(parts[0]) + "\" data-value=\"" + escapeHTML(parts[1]) + "\"></span>";
};
// HTML encode primary data
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
index 93d7925..4492313 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
@@ -63,11 +63,13 @@
Result kr = ki.search(sq, (short) 10);
assertEquals(4, kr.getMatches().size());
+ // Doc 0
assertEquals(2, kr.getMatch(0).getStartPos());
assertEquals(3, kr.getMatch(0).getEndPos());
assertEquals(-1, kr.getMatch(0).getStartPage());
assertEquals(-1, kr.getMatch(0).getEndPage());
+ // Doc 1
assertEquals(2, kr.getMatch(1).getStartPos());
assertEquals(3, kr.getMatch(1).getEndPos());
assertEquals(528, kr.getMatch(1).getStartPage());
@@ -75,8 +77,10 @@
assertEquals(5, kr.getMatch(2).getStartPos());
assertEquals(6, kr.getMatch(2).getEndPos());
- assertEquals(529, kr.getMatch(2).getStartPage());
- assertEquals(-1, kr.getMatch(2).getEndPage());
+ assertEquals(528, kr.getMatch(2).getStartPage());
+ assertEquals("<span class=\"context-left\">abcab</span><span class=\"match\"><mark>c</mark></span><span class=\"context-right\">abac</span>",
+ kr.getMatch(2).getSnippetHTML());
+ assertEquals(529, kr.getMatch(2).getEndPage()); // Debatable
assertEquals(9, kr.getMatch(3).getStartPos());
assertEquals(10, kr.getMatch(3).getEndPos());
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
index 4db8b15..ec38fa4 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
@@ -740,6 +740,29 @@
assertEquals(529, res.at("/pages/0").asInt());
};
+ @Test
+ public void searchJSONwithUtteranceAttributes () throws IOException {
+ // Construct index
+ KrillIndex ki = new KrillIndex();
+ // Indexing test files
+ FieldDocument fd = ki.addDoc(1,
+ getClass().getResourceAsStream("/others/kokokom-example.json.gz"), true);
+ ki.commit();
+
+ assertEquals(fd.getUID(), 1);
+ assertEquals(fd.getTextSigle(), "KTC/001/000001");
+
+ Krill ks = new Krill(new QueryBuilder("tokens").seg("s:Räuspern"));
+ Result kr = ks.apply(ki);
+
+ assertEquals(1, kr.getTotalResults());
+ assertEquals(0, kr.getStartIndex());
+ assertEquals(25, kr.getItemsPerPage());
+ Match m = kr.getMatch(0);
+ assertEquals("<span class=\"context-left\"></span><span class=\"match\"><span class=\"inline-marker\" data-key=\"who\" data-value=\"Mai Thi Nguyen-Kim\"></span><span class=\"inline-marker\" data-key=\"start\" data-value=\"0:00\"></span><span class=\"inline-marker\" data-key=\"end\" data-value=\"01:20\"></span>(<mark>Räuspern</mark></span><span class=\"context-right\">) Wie viele Geschlechter gibt es? Wenn<span class=\"more\"></span></span>", m.getSnippetHTML());
+ };
+
+
@Test
public void searchJSONnewJSON2 () throws IOException {
@@ -1463,7 +1486,7 @@
assertEquals(2, kr.getMatch(0).getStartPos());
assertEquals(52, kr.getMatch(0).getEndPos());
assertEquals(kr.getMatch(0).getSnippetBrackets(),
- "Maximen und [[Reflexionen Religion und Christentum. wir sind naturforschend Pantheisten, dichtend Polytheisten, sittlich Monotheisten. Gott, wenn wir hoch stehen, ist alles; stehen wir niedrig, so ist er ein Supplement unsrer Armseligkeit. die Kreatur ist sehr schwach; denn sucht sie etwas, findet sie's nicht. stark aber ist Gott; denn sucht er die Kreatur]<!>], so hat er sie gleich in ...");
+ "Maximen und [[Reflexionen Religion und Christentum. wir sind naturforschend Pantheisten, dichtend Polytheisten, sittlich Monotheisten. Gott, wenn wir hoch stehen, ist alles; stehen wir niedrig, so ist er ein Supplement unsrer Armseligkeit. die Kreatur ist sehr schwach; denn sucht sie etwas, findet sie's nicht. stark aber ist Gott; denn sucht er die Kreatur]<!>], so hat er sie gleich in ...");
assertEquals(kr.getMatch(0).getSnippetHTML(),
"<span class=\"context-left\">Maximen und </span><span class=\"match\"><mark>Reflexionen Religion und Christentum. wir sind naturforschend Pantheisten, dichtend Polytheisten, sittlich Monotheisten. Gott, wenn wir hoch stehen, ist alles; stehen wir niedrig, so ist er ein Supplement unsrer Armseligkeit. die Kreatur ist sehr schwach; denn sucht sie etwas, findet sie's nicht. stark aber ist Gott; denn sucht er die Kreatur</mark><span class=\"cutted\"></span></span><span class=\"context-right\">, so hat er sie gleich in<span class=\"more\"></span></span>");
assertEquals(kr.getMatch(0).getTextSigle(), "GOE_AGX.00002");
diff --git a/src/test/resources/others/kokokom-example.json.gz b/src/test/resources/others/kokokom-example.json.gz
new file mode 100644
index 0000000..d0ed313
--- /dev/null
+++ b/src/test/resources/others/kokokom-example.json.gz
Binary files differ