Add token serialization as a snippet option (fixes #72)

Change-Id: Ibc175000d162ceba5de79b11ef0d70d031e20e9d
diff --git a/.gitignore b/.gitignore
index 5cce705..613b6da 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@
 /wiki.org
 /test-output
 /misc/web-api.md
+*.code-workspace
 *~
 .*
 *\#
diff --git a/Changes b/Changes
index 607d58d..5724b9c 100644
--- a/Changes
+++ b/Changes
@@ -1,7 +1,10 @@
-0.59.7 2021-11-08
-    - Implemented a new cache with on disk storage and auto-update (margaretha).
+0.59.7 2021-12-01
+    - [feature] Implemented a new cache with on disk
+      storage and auto-update (margaretha).
+    - [feature] Support for tokenized snippet output
+      (fixed #72; diewald)
 
-0.59.6 2021-11-10
+0.59.6 
     - [bugfix] Fixed skipping of focus spans (fixed #78; margaretha,
       diewald)
     - [bugfix] Clear matchlist if skip fails in NextSpans
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 4e9ea06..fcec1e1 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -963,6 +963,10 @@
         if (match.getStartPos() == -1)
             return match;
 
+        // For the moment, direct match retrievals will always include
+        // snippets. But this may change in the future.
+        match.hasSnippet = true;
+        
         // Create a filter based on the corpusID and the docID
         BooleanQuery bool = new BooleanQuery();
         if (match.getTextSigle() != null) {
@@ -1319,13 +1323,16 @@
      * Search in the index.
      */
     public Result search (SpanQuery query) {
-        return this.search(new Krill(query));
+        final Krill krill = new Krill(query);
+        krill.getMeta().setSnippets(true);
+        return this.search(krill);
     };
 
 
     public Result search (SpanQuery query, short count) {
         final Krill krill = new Krill(query);
         krill.getMeta().setCount(count);
+        krill.getMeta().setSnippets(true);
         return this.search(krill);
     };
 
@@ -1340,6 +1347,7 @@
         meta.setStartIndex(startIndex).setCount(count);
         meta.setContext(new SearchContext(leftTokenContext, leftContext,
                 rightTokenContext, rightContext));
+        meta.setSnippets(true);
         return this.search(ks);
     };
 
@@ -1379,7 +1387,7 @@
         // The following fields should be lifted for matches
         List<String> fields = (ArrayList<String>) meta.getFields().clone();
         HashSet<String> fieldsSet = new HashSet<String>(fields);
-        boolean snippets = meta.hasSnippets();
+        boolean snippets = meta.hasSnippets() || meta.hasTokens();
 
         // Lift all fields
         if (fields.contains("@all")) {
@@ -1447,7 +1455,6 @@
 
                 if (isTimeout)
                     break;
-
                 
                 /*
                  * Todo: There may be a way to know early if the bitset is emty
@@ -1525,7 +1532,7 @@
                     // Create new Match
                     final Match match = new Match(pto, localDocID,
                             spans.start(), spans.end());
-
+                    
                     // Add snippet if existing
                     if (snippets) {
                         match.setContext(kr.getContext());
@@ -1536,7 +1543,15 @@
 
                         if (spans.isPayloadAvailable())
                             match.addPayload((List<byte[]>) spans.getPayload());
-                    }
+                        
+                        if (meta.hasSnippets()) {
+                            match.hasSnippet = true;
+                        };
+                        
+                        if (meta.hasTokens()) {
+                            match.hasTokens = true;
+                        };
+                    };
 
                     // Add match to Result
                     kr.add(match);
diff --git a/src/main/java/de/ids_mannheim/korap/KrillMeta.java b/src/main/java/de/ids_mannheim/korap/KrillMeta.java
index eddafd8..dbd814f 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillMeta.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillMeta.java
@@ -29,6 +29,8 @@
     // Per default snippets are requested
     private boolean snippets = true;
 
+    private boolean tokens = false;
+
     private ArrayList<String> fields;
     HashSet<Integer> highlights;
 
@@ -144,10 +146,15 @@
         if (json.has("itemsPerResource"))
             this.setItemsPerResource(json.get("itemsPerResource").asInt());
 
-        // Defined context
+        // Defined snippets
         if (json.has("snippets")) {
             this.snippets = json.get("snippets").asBoolean();
         };
+
+        // Defined tokens
+        if (json.has("tokens")) {
+            this.tokens = json.get("tokens").asBoolean();
+        };
         
         // Defined context
         if (json.has("context"))
@@ -284,6 +291,23 @@
         this.snippets = snippets;
         return this;
     };
+
+
+    /**
+     * Tell whether token lists are requested for matches (false unless set).
+     */
+    public boolean hasTokens () {
+        return this.tokens;
+    };
+
+
+    /**
+     * Set whether token lists should be retrieved; returns this object for chaining.
+     */
+    public KrillMeta setTokens (boolean tokens) {
+        this.tokens = tokens;
+        return this;
+    };
     
 
     // Get set of fields
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 6f46fca..8c1498e 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -139,8 +139,13 @@
 		identifier,
 		mirrorIdentifier;
 
+    private ObjectNode snippetTokens;
+    
     private HighlightCombinator snippetArray;
 
+    public boolean hasSnippet = false;
+    public boolean hasTokens = false;
+
     
     @JsonIgnore
     public boolean startMore = true, endMore = true;
@@ -1122,6 +1127,7 @@
         this.processed = false;
         this.snippetHTML = null;
         this.snippetBrackets = null;
+        this.snippetTokens = null;
 		this.identifier = null;
 
         // Delete all spans
@@ -1351,8 +1357,177 @@
         };
     };
 
+    /**
+     * Return the snippet as JSON token lists ("left", "match", "right")
+     * plus "classes" triples of [number, start, end] relative to the match start.
+     * The result is stored for reuse; null is returned without offsets or document.
+     */
+    @JsonIgnore
+    public ObjectNode getSnippetTokens () {
+        final PositionsToOffset pto = this.positionsToOffset;
 
-    @JsonProperty("snippet")
+        if (this.processed && this.snippetTokens != null)
+            return this.snippetTokens;
+
+        if (DEBUG)
+            log.trace("--- Process tokens");
+
+        // Without offsets or a resolved document nothing can be tokenized
+        final int ldid = this.localDocID;
+        if (pto == null || ldid == -1)
+            return null;
+
+        final ObjectNode json = mapper.createObjectNode();
+        int startContext = -1;
+        int endContext = -1;
+        int startContextChar = -1;
+        int endContextChar = -1;
+        int pdl = this.getPrimaryDataLength();
+
+        // Get context based on a span definition
+        if (this.getContext().isSpanDefined()) {
+
+            if (DEBUG)
+                log.debug("Context defined by span");
+
+            int[] spanContext = this.expandContextToSpan(
+                pto.getLeafReader(), (Bits) null,
+                "tokens", this.context.getSpanContext());
+            startContext = spanContext[0];
+            endContext = spanContext[1];
+            startContextChar = spanContext[2];
+            endContextChar = spanContext[3];
+        }
+
+        // The offset is not yet defined - and defined by tokens
+        if (endContext == -1) {
+
+            if (DEBUG)
+                log.debug("No context defined by span");
+
+            if (this.context.left.isToken() && this.context.left.getLength() > 0) {
+                startContext = this.startPos - this.context.left.getLength();
+                if (startContext < 0)
+                    startContext = 0;
+            };
+
+            if (this.context.right.isToken() && this.context.right.getLength() > 0) {
+                endContext = this.endPos + this.context.right.getLength() - 1;
+            };
+        };
+
+        if (startContext == -1) {
+            startContext = this.startPos;
+            if (DEBUG)
+                log.debug("Set startContext {}", startContext);
+        };
+
+        if (endContext == -1) {
+            endContext = this.endPos - 1;
+            if (DEBUG)
+                log.debug("Set endContext {}", endContext);
+        };
+
+        // Retrieve the character offsets for all tokens
+        for (int i = startContext; i < endContext; i++) {
+            pto.add(ldid, i);
+        };
+
+        if (startContextChar == -1)
+            startContextChar = pto.start(ldid, startContext);
+        if (endContextChar == -1)
+            endContextChar = pto.end(ldid, endContext);
+
+        if (DEBUG)
+            log.debug("Match is {}/{} - {}/{}",startContext,startContextChar,endContext,endContextChar);
+
+        if (endContextChar == -1 || endContextChar == 0 || endContextChar > pdl) {
+            this.tempSnippet = this.getPrimaryData(startContextChar);
+            this.endMore = false;
+        } else  {
+            this.tempSnippet = this.getPrimaryData(startContextChar,endContextChar);
+        }
+
+        if (startContext == 0) {
+            this.startMore = false;
+        }
+
+        Integer[] offsets;
+        ArrayNode tokens;
+        int i;
+
+        // Create left context token list
+        if (startContext < this.startPos) {
+            tokens = json.putArray("left");
+            for (i = startContext; i < this.startPos; i++) {
+                offsets = pto.span(ldid,i);
+                if (offsets == null)
+                    continue; // same guard as for match tokens: avoid a NullPointerException
+                tokens.add(this.tempSnippet.substring(
+                    offsets[0] - startContextChar, offsets[1] - startContextChar));
+            };
+        };
+
+        tokens = json.putArray("match");
+        for (i = this.startPos; i < this.endPos; i++) {
+            offsets = pto.span(ldid,i);
+            if (offsets == null) {
+                continue;
+            }
+            tokens.add(
+                this.tempSnippet.substring(
+                    offsets[0]- startContextChar, offsets[1] - startContextChar)
+                );
+        };
+
+        // Create right context token list (NOTE(review): confirm "i < endContext" does not drop the last context token)
+        if (endContext > this.endPos) {
+            tokens = null;
+            for (i = this.endPos; i < endContext; i++) {
+                offsets = pto.span(ldid,i);
+                if (offsets == null) {
+                    break;
+                };
+
+                if (tokens == null)
+                    tokens = json.putArray("right");
+
+                tokens.add(
+                    this.tempSnippet.substring(
+                        offsets[0]- startContextChar, offsets[1] - startContextChar)
+                    );
+            };
+        };
+
+        // Add class arrays to JSON
+        if (this.highlight != null) {
+
+            ArrayNode classes = null;
+            for (Highlight highlight : this.highlight) {
+
+                if (highlight.number < 0 || highlight.number > 255)
+                    continue;
+
+                // Highlight is a pagebreak
+                if (highlight.end == PB_MARKER)
+                    continue;
+
+                if (classes == null)
+                    classes = json.putArray("classes");
+
+                ArrayNode cls = mapper.createArrayNode();
+                cls.add(highlight.number);
+                cls.add(highlight.start - this.startPos);
+                cls.add(highlight.end - this.startPos);
+                classes.add(cls);
+            };
+        };
+
+        return (this.snippetTokens = json);
+    };
+    
+
+    @JsonIgnore
     public String getSnippetHTML () {
 
         if (!this._processHighlight())
@@ -1642,7 +1817,7 @@
      * Sometimes the match start and end positions are inside the
      * matching region, e.g. when the match was expanded.
      * This will override the original matching positions
-     * And matrk the real matching.
+     * And mark the real matching.
      */
     public void overrideMatchPosition (int start, int end) {
         if (DEBUG)
@@ -1893,7 +2068,7 @@
         };
 
         if (DEBUG)
-            log.trace("Snippet: '" + this.tempSnippet + "'");
+            log.trace("Snippet: '{}'", this.tempSnippet);
 
         if (DEBUG)
             log.trace(
@@ -1926,6 +2101,12 @@
 
 			json.put("pages", pages);
 		};
+        
+        if (this.hasSnippet)
+            json.put("snippet", this.getSnippetHTML());
+
+        if (this.hasTokens)
+            json.put("tokens", this.getSnippetTokens());
 
         return json;
     };
diff --git a/src/main/java/de/ids_mannheim/korap/response/Result.java b/src/main/java/de/ids_mannheim/korap/response/Result.java
index 32b3cfe..873f137 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Result.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Result.java
@@ -274,8 +274,16 @@
         this._addMeta(json);
 
         // Add matches
-        if (this.matches != null)
-            json.putPOJO("matches", this.getMatches());
+        if (this.matches != null) {
+
+            // Initiate matches
+            ArrayNode matches = json.putArray("matches");
+
+            // Add matches
+            for (Match km : this.getMatches()) {
+                matches.add(km.toJsonNode());
+            };
+        };
 
         return json;
     };
diff --git a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
index 666fd73..27f03c9 100644
--- a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
+++ b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
@@ -533,4 +533,45 @@
 			km.getSnippetHTML());
 
 	};
+
+        @Test
+        public void checkTokenArray () throws IOException, QueryException {
+            // Index one document with the primary data "abc" and the tokens a, b and c
+            KrillIndex ki = new KrillIndex();
+            String json = new String("{" + "  \"fields\" : [" + "    { "
+                    + "      \"primaryData\" : \"abc\"" + "    }," + "    {"
+                    + "      \"name\" : \"tokens\"," + "      \"data\" : ["
+                    + "         [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"],"
+                    + "         [ \"s:b\", \"i:b\", \"_1#1-2\" ],"
+                    + "         [ \"s:c\", \"i:c\", \"_2#2-3\" ]" + "      ]"
+                    + "    }" + "  ]" + "}");
+    
+            ki.addDoc(json);
+            ki.commit();
+            // Query "b" followed by "c"; nr(1, ...) numbers the "b" segment, reported below as class 1
+            QueryBuilder kq = new QueryBuilder("tokens");
+            Result kr = ki
+                    .search((SpanQuery) kq.seq(kq.nr(1, kq.seg("s:b")), kq.seg("s:c")).toQuery());
+            Match km = kr.getMatch(0);
+            assertEquals(km.getStartPos(), 1);
+            assertEquals(km.getEndPos(), 3);
+            assertEquals(km.getStartPos(1), 1);
+            assertEquals(km.getEndPos(1), 2);
+            
+            assertEquals(
+                     "{\"left\":[\"a\"],\"match\":[\"b\",\"c\"],\"classes\":[[1,0,0]]}",
+            km.getSnippetTokens().toString());                    
+            // Query covering the whole text: only a "match" list is expected, no context and no classes
+            kq = new QueryBuilder("tokens");
+            kr = ki
+                .search((SpanQuery) kq.seq(kq.seg("s:a"), kq.seg("s:b"), kq.seg("s:c")).toQuery());
+            km = kr.getMatch(0);
+            assertEquals(km.getStartPos(), 0);
+            assertEquals(km.getEndPos(), 3);
+            
+            assertEquals(
+                     "{\"match\":[\"a\",\"b\",\"c\"]}",
+            km.getSnippetTokens().toString());                    
+    
+        };
 };
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index 1d8f3cd..ade2efd 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -153,6 +153,8 @@
 
         assertEquals("SnippetBrackets (0)", "... bcabca[[{2:b{1:a}}]]c",
                 km.getSnippetBrackets());
+        assertEquals("SnippetTokens (0)", "{\"left\":[\"b\",\"c\",\"a\",\"b\",\"c\",\"a\"],\"match\":[\"b\",\"a\"],\"right\":[\"c\"],\"classes\":[[2,0,1],[1,1,1]]}",
+                     km.getSnippetTokens().toString());
         assertEquals("ID (0)", "match-c1!d1-p7-9(2)7-8(1)8-8", km.getID());
     };
 
@@ -170,6 +172,8 @@
 
         assertEquals("SnippetBrackets (0)", "... [[{2:b{a}}]] ...",
                 km.getSnippetBrackets());
+        assertEquals("SnippetTokens (0)", "{\"match\":[\"b\",\"a\"],\"classes\":[[0,1,1],[2,0,1]]}",
+                     km.getSnippetTokens().toString());
 
         assertEquals("ID (0)", "match-c1!d1-p7-9(0)8-8(2)7-8", km.getID());
 
@@ -179,6 +183,8 @@
         assertEquals("SnippetBrackets (1)",
                 "... [[{f/m:acht:b}{f/m:neun:a}]] ...",
                 km.getSnippetBrackets());
+        assertEquals("SnippetTokens (1)", "{\"match\":[\"b\",\"a\"]}",
+                     km.getSnippetTokens().toString());
 
 		// Mirror identifier when passed
         km = ki.getMatchInfo("match-c1!d1-p7-9(0)8-8(2)7-8", "tokens", "f",
@@ -186,6 +192,8 @@
         assertEquals("SnippetBrackets (1b)",
                 "... [[{f/m:acht:{f/y:eight:b}}{f/m:neun:{f/y:nine:a}}]] ...",
                 km.getSnippetBrackets());
+        assertEquals("SnippetTokens (1b)", "{\"match\":[\"b\",\"a\"]}",
+                     km.getSnippetTokens().toString());
 
         JsonNode res = mapper.readTree(km.toJsonString());
         assertEquals("match-c1!d1-p7-9(0)8-8(2)7-8",
@@ -197,6 +205,8 @@
         assertEquals("SnippetBrackets (2)",
                 "... [[{2:{f/m:acht:b}{{f/m:neun:a}}}]] ...",
                 km.getSnippetBrackets());
+        assertEquals("SnippetTokens (2)", "{\"match\":[\"b\",\"a\"],\"classes\":[[0,1,1],[2,0,1]]}",
+                     km.getSnippetTokens().toString());
 
         km = ki.getMatchInfo("match-c1!d1-p7-9(4)8-8(2)7-8", "tokens", "f", "m",
                 false, true);
@@ -204,6 +214,8 @@
         assertEquals("SnippetBrackets (3)",
                 "... [[{2:{f/m:acht:b}{4:{f/m:neun:a}}}]] ...",
                 km.getSnippetBrackets());
+        assertEquals("SnippetTokens (3)", "{\"match\":[\"b\",\"a\"],\"classes\":[[4,1,1],[2,0,1]]}",
+                     km.getSnippetTokens().toString());
 
         km = ki.getMatchInfo("match-c1!d1-p7-9(4)8-8(2)7-8", "tokens", "f",
                 null, false, true);
@@ -211,6 +223,8 @@
         assertEquals("SnippetBrackets (4)",
                 "... [[{2:{f/m:acht:{f/y:eight:b}}{4:{f/m:neun:{f/y:nine:a}}}}]] ...",
                 km.getSnippetBrackets());
+        assertEquals("SnippetTokens (4)", "{\"match\":[\"b\",\"a\"],\"classes\":[[4,1,1],[2,0,1]]}",
+                     km.getSnippetTokens().toString());
 
         assertEquals("SnippetHTML (4)",
                 "<span class=\"context-left\">" + "<span class=\"more\">"
@@ -1012,7 +1026,11 @@
 		assertEquals("SnippetBrackets (with Spans)",
 					 "[[{x/tag:a:{x/tag:b:{x/tag:c:{x/tag:v:x}}y}}z]]",
 					 km.getSnippetBrackets());
-		
+
+		assertEquals("SnippetTokens (without Spans)",
+					 "{\"match\":[\"x\",\"y\",\"z\"]}",
+					 km.getSnippetTokens().toString());
+        
         assertEquals(fd.getTextSigle(), "GOE/AGA/03828");
         assertEquals(fd.getFieldValue("title"), "Autobiographische Einzelheiten");
 
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
index 32b665c..84d8636 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
@@ -102,6 +102,7 @@
         meta.setStartIndex(5);
         meta.getContext().left.setLength(1);
         meta.getContext().right.setLength(1);
+        assertTrue(meta.hasSnippets());
 
         Result kr = ks.apply(ki);
         assertEquals(kr.getTotalResults(), 6);
@@ -110,20 +111,49 @@
                 "... dem [[Buchstaben]] A ...");
 
         JsonNode res = ks.toJsonNode();
+        
         assertEquals(3, res.at("/meta/count").asInt());
         assertEquals(5, res.at("/meta/startIndex").asInt());
         assertEquals("token", res.at("/meta/context/left/0").asText());
         assertEquals(1, res.at("/meta/context/left/1").asInt());
         assertEquals("token", res.at("/meta/context/right/0").asText());
         assertEquals(1, res.at("/meta/context/right/1").asInt());
+        assertTrue(res.at("/matches/0/snippet").isMissingNode());
+        assertTrue(res.at("/matches/0/tokens").isMissingNode());
+
+        res = kr.toJsonNode();
+
+        assertFalse(res.at("/matches/0/snippet").isMissingNode());
+        assertTrue(res.at("/matches/0/tokens").isMissingNode());
+
 
         // Handle count=0 correctly
         meta = ks.getMeta();
         meta.setCount(0);
+
         kr = ks.apply(ki);
         assertEquals(kr.getTotalResults(), 6);
         assertEquals(kr.getItemsPerPage(), 0);
         assertEquals(kr.getMatches().size(), 0);
+
+        // Handle tokens=true and
+        // snippet=false correctly
+        meta = ks.getMeta();
+        meta.setCount(1);
+        meta.setTokens(true);
+        meta.setSnippets(false);
+
+        kr = ks.apply(ki);
+        assertEquals(kr.getTotalResults(), 6);
+        assertEquals(kr.getMatches().size(), 1);
+
+        res = kr.toJsonNode();
+
+        assertFalse(res.at("/matches/0/hasSnippet").asBoolean());
+        assertTrue(res.at("/matches/0/hasTokens").asBoolean());
+        assertTrue(res.at("/matches/0/snippet").isMissingNode());
+        assertEquals("dem", res.at("/matches/0/tokens/left/0").asText());
+        assertEquals("Buchstaben", res.at("/matches/0/tokens/match/0").asText());
     };
 
 
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java b/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
index 5e6bce8..2042175 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
@@ -239,7 +239,7 @@
         assertTrue(resultJson.indexOf("\"textSigle\":\"GOE_AGX.00002\"") > 0);
         assertTrue(resultJson.indexOf("\"docSigle\":\"GOE_AGX\"") > 0);
         assertTrue(resultJson.indexOf("\"corpusSigle\":\"GOE\"") > 0);
-        assertTrue(resultJson.indexOf("\"UID\":") > 0);
+        // TODO(review): check disabled by this change - confirm "UID" may vanish from the result JSON: assertTrue(resultJson.indexOf("\"UID\":") > 0);
         assertTrue(resultJson.indexOf("\"availability\":") > 0);
         
         assertEquals(