Fixed context expansion bug for new corpora

Change-Id: Id56324f94ef9a85815b38479f289f57c7ffd58b6
diff --git a/Changes b/Changes
index 1bf1d78..6959658 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.52 2015-07-02
+0.52 2015-07-08
         - [bugfix] Fixed payload filtering in FocusSpans (margaretha)
 	- [workaround] Reintroduced empty collection support,
 	  as Koral still creates them (diewald)
@@ -26,6 +26,7 @@
 	- [feature] Support '@all' as a 'fields' value for all meta
 	  data fields (diewald)
 	- [bugfix] Fix case sensitivity bug in filters (diewald)
+	- [bugfix] Fix sentence expansion bug for new data (diewald)
 
 0.51 2015-03-17
         - This is a major version (prepared for the GitHub release)
diff --git a/Errorcodes b/Errorcodes
index bb412dd..62b3e2f 100644
--- a/Errorcodes
+++ b/Errorcodes
@@ -7,6 +7,7 @@
 610: "Missing request parameters"
 620: "Unable to generate JSON"
 621: "Unable to parse JSON"
+651: "Unable to extend context"
 680: "Server is up and running!"
 681: "Document was added successfully", document id
 682: "Response time exceeded"
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 02597d1..73a604a 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -824,13 +824,14 @@
                     BooleanClause.Occur.MUST);
         }
 
-        // LEGACY
+        // <legacy>
         else {
             bool.add(new TermQuery(new Term("ID", match.getDocID())),
                     BooleanClause.Occur.MUST);
             bool.add(new TermQuery(new Term("corpusID", match.getCorpusID())),
                     BooleanClause.Occur.MUST);
         };
+        // </legacy>
 
         Filter filter = (Filter) new QueryWrapperFilter(bool);
 
@@ -965,18 +966,35 @@
                 match.setLocalDocID(localDocID);
                 match.populateDocument(doc, field, fields);
                 if (DEBUG)
-                    log.trace("The document has the id '{}'", match.getDocID());
+                    log.trace("The document has the id '{}' or the sigle '{}'",
+                              match.getDocID(),
+                              match.getTextSigle());
 
                 // Todo:
                 SearchContext context = match.getContext();
 
                 // Search for minimal surrounding sentences
                 if (extendToSentence) {
-                    int[] spanContext = match.expandContextToSpan("s");
-                    match.setStartPos(spanContext[0]);
-                    match.setEndPos(spanContext[1]);
-                    match.startMore = false;
-                    match.endMore = false;
+
+                    String element = (match.getTextSigle() == null ? "s" : "base/s:s");
+
+                    // <legacy> Documents without a text sigle use the plain 's' element
+                    int[] spanContext = match.expandContextToSpan(element);
+
+                    if (DEBUG)
+                        log.trace("Extend to sentence element '{}'", element);
+
+                    // </legacy>
+                    if (spanContext[0] >= 0 &&
+                        spanContext[0] < spanContext[1]) {
+                        match.setStartPos(spanContext[0]);
+                        match.setEndPos(spanContext[1]);
+                        match.startMore = false;
+                        match.endMore = false;
+                    }
+                    else {
+                        match.addWarning(651, "Unable to extend context");
+                    };
                 }
                 else {
                     if (DEBUG)
diff --git a/src/main/java/de/ids_mannheim/korap/response/match/DocIdentifier.java b/src/main/java/de/ids_mannheim/korap/response/match/DocIdentifier.java
index 660a831..b384222 100644
--- a/src/main/java/de/ids_mannheim/korap/response/match/DocIdentifier.java
+++ b/src/main/java/de/ids_mannheim/korap/response/match/DocIdentifier.java
@@ -7,8 +7,9 @@
 // TODO: This should only use textSigle!
 
 public class DocIdentifier {
-    protected String textSigle, corpusID, // LEGACY
-            docID;    // LEGACY
+    protected String textSigle, // preferred identifier (not legacy)
+        corpusID, // LEGACY
+        docID;    // LEGACY
 
 
     // Legacy
diff --git a/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java b/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
index aee9851..f9fdfbe 100644
--- a/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
+++ b/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
@@ -10,9 +10,10 @@
 
     // TODO: "contains" is necessary for a compatibility bug in Kustvakt
     Pattern idRegex = Pattern
-            .compile("^(?:match-|contains-)(?:([^!]+?)[!\\.])?"
-                    + "([^!]+)-p([0-9]+)-([0-9]+)"
-                    + "((?:\\(-?[0-9]+\\)-?[0-9]+--?[0-9]+)*)" + "(?:c.+?)?$");
+        .compile("^(?:match-|contains-)"
+                 + "(?:([^!]+?)[!\\.])?"
+                 + "([^!]+)-p([0-9]+)-([0-9]+)"
+                 + "((?:\\(-?[0-9]+\\)-?[0-9]+--?[0-9]+)*)" + "(?:c.+?)?$");
     Pattern posRegex = Pattern.compile("\\(([0-9]+)\\)([0-9]+)-([0-9]+)");
 
 
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index 71926fd..49f9d8c 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -78,9 +78,22 @@
 
         id = new MatchIdentifier(
                 "match-c1!d1-p4-20(5)7-8(-2)9-10(2)3-4(3)-5-6(4)7-8(5)9--10");
+        assertEquals(4, id.getStartPos());
+        assertEquals(20, id.getEndPos());
+        assertEquals("c1", id.getCorpusID());
+        assertEquals("d1", id.getDocID());
+        assertEquals(null, id.getTextSigle());
         assertEquals(id.toString(), "match-c1!d1-p4-20(5)7-8(2)3-4(4)7-8");
-    };
 
+        id = new MatchIdentifier(
+                "match-GOE!GOE_AGF.02286-p2105-2106");
+        assertEquals(2105, id.getStartPos());
+        assertEquals(2106, id.getEndPos());
+        assertEquals(null, id.getCorpusID());
+        assertEquals(null, id.getDocID());
+        assertEquals("GOE_AGF.02286", id.getTextSigle());
+        assertEquals("match-GOE_AGF.02286-p2105-2106", id.toString());
+    };
 
     @Test
     public void posIdentifierExample1 () throws IOException {
@@ -242,6 +255,29 @@
                 km.getSnippetHTML());
     };
 
+    @Test
+    public void indexNewStructure () throws IOException, QueryException {
+        KrillIndex ki = new KrillIndex();
+        ki.addDoc(getClass().getResourceAsStream("/goe/AGX-00002.json"), false);
+        ki.commit();
+
+        Match km = ki.getMatchInfo(
+                                   "match-GOE!GOE_AGX.00002-p210-211",
+                                   "tokens",
+                                   true,
+                                   (String) null,
+                                   (String) null,
+                                   true,
+                                   true,
+                                   true
+                                   );
+
+        JsonNode res = mapper.readTree(km.toJsonString());
+        assertEquals("tokens", res.at("/field").asText());
+        assertEquals("GOE_AGX.00002", res.at("/textSigle").asText());
+        assertEquals("Goethe, Johann Wolfgang von", res.at("/author").asText());
+    };
+
 
     @Test
     public void indexExample5Spans () throws IOException, QueryException {