Fixed context expansion bug for new corpora
Change-Id: Id56324f94ef9a85815b38479f289f57c7ffd58b6
diff --git a/Changes b/Changes
index 1bf1d78..6959658 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.52 2015-07-02
+0.52 2015-07-08
- [bugfix] Fixed payload filtering in FocusSpans (margaretha)
- [workaround] Reintroduced empty collection support,
as Koral still creates them (diewald)
@@ -26,6 +26,7 @@
- [feature] Support '@all' as a 'fields' value for all meta
data fields (diewald)
- [bugfix] Fix case sensitivity bug in filters (diewald)
+ - [bugfix] Fix sentence expansion bug for new data (diewald)
0.51 2015-03-17
- This is a major version (prepared for the GitHub release)
diff --git a/Errorcodes b/Errorcodes
index bb412dd..62b3e2f 100644
--- a/Errorcodes
+++ b/Errorcodes
@@ -7,6 +7,7 @@
610: "Missing request parameters"
620: "Unable to generate JSON"
621: "Unable to parse JSON"
+651: "Unable to extend context"
680: "Server is up and running!"
681: "Document was added successfully", document id
682: "Response time exceeded"
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 02597d1..73a604a 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -824,13 +824,14 @@
BooleanClause.Occur.MUST);
}
- // LEGACY
+ // <legacy>
else {
bool.add(new TermQuery(new Term("ID", match.getDocID())),
BooleanClause.Occur.MUST);
bool.add(new TermQuery(new Term("corpusID", match.getCorpusID())),
BooleanClause.Occur.MUST);
};
+ // </legacy>
Filter filter = (Filter) new QueryWrapperFilter(bool);
@@ -965,18 +966,35 @@
match.setLocalDocID(localDocID);
match.populateDocument(doc, field, fields);
if (DEBUG)
- log.trace("The document has the id '{}'", match.getDocID());
+ log.trace("The document has the id '{}' or the sigle '{}'",
+ match.getDocID(),
+ match.getTextSigle());
// Todo:
SearchContext context = match.getContext();
// Search for minimal surrounding sentences
if (extendToSentence) {
- int[] spanContext = match.expandContextToSpan("s");
- match.setStartPos(spanContext[0]);
- match.setEndPos(spanContext[1]);
- match.startMore = false;
- match.endMore = false;
+
+ String element = (match.getTextSigle() == null ? "s" : "base/s:s");
+
+ // <legacy> Support for legacy annotations ("s" instead of "base/s:s")
+ int[] spanContext = match.expandContextToSpan(element);
+
+ if (DEBUG)
+ log.trace("Extend to sentence element '{}'", element);
+
+ // </legacy>
+ if (spanContext[0] >= 0 &&
+ spanContext[0] < spanContext[1]) {
+ match.setStartPos(spanContext[0]);
+ match.setEndPos(spanContext[1]);
+ match.startMore = false;
+ match.endMore = false;
+ }
+ else {
+ match.addWarning(651, "Unable to extend context");
+ };
}
else {
if (DEBUG)
diff --git a/src/main/java/de/ids_mannheim/korap/response/match/DocIdentifier.java b/src/main/java/de/ids_mannheim/korap/response/match/DocIdentifier.java
index 660a831..b384222 100644
--- a/src/main/java/de/ids_mannheim/korap/response/match/DocIdentifier.java
+++ b/src/main/java/de/ids_mannheim/korap/response/match/DocIdentifier.java
@@ -7,8 +7,9 @@
// TODO: This should only use textSigle!
public class DocIdentifier {
- protected String textSigle, corpusID, // LEGACY
- docID; // LEGACY
+ protected String textSigle, // current identifier (not legacy)
+ corpusID, // LEGACY
+ docID; // LEGACY
// Legacy
diff --git a/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java b/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
index aee9851..f9fdfbe 100644
--- a/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
+++ b/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
@@ -10,9 +10,10 @@
// TODO: "contains" is necessary for a compatibility bug in Kustvakt
Pattern idRegex = Pattern
- .compile("^(?:match-|contains-)(?:([^!]+?)[!\\.])?"
- + "([^!]+)-p([0-9]+)-([0-9]+)"
- + "((?:\\(-?[0-9]+\\)-?[0-9]+--?[0-9]+)*)" + "(?:c.+?)?$");
+ .compile("^(?:match-|contains-)"
+ + "(?:([^!]+?)[!\\.])?"
+ + "([^!]+)-p([0-9]+)-([0-9]+)"
+ + "((?:\\(-?[0-9]+\\)-?[0-9]+--?[0-9]+)*)" + "(?:c.+?)?$");
Pattern posRegex = Pattern.compile("\\(([0-9]+)\\)([0-9]+)-([0-9]+)");
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index 71926fd..49f9d8c 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -78,9 +78,22 @@
id = new MatchIdentifier(
"match-c1!d1-p4-20(5)7-8(-2)9-10(2)3-4(3)-5-6(4)7-8(5)9--10");
+ assertEquals(4, id.getStartPos());
+ assertEquals(20, id.getEndPos());
+ assertEquals("c1", id.getCorpusID());
+ assertEquals("d1", id.getDocID());
+ assertEquals(null, id.getTextSigle());
assertEquals(id.toString(), "match-c1!d1-p4-20(5)7-8(2)3-4(4)7-8");
- };
+ id = new MatchIdentifier(
+ "match-GOE!GOE_AGF.02286-p2105-2106");
+ assertEquals(2105, id.getStartPos());
+ assertEquals(2106, id.getEndPos());
+ assertEquals(null, id.getCorpusID());
+ assertEquals(null, id.getDocID());
+ assertEquals("GOE_AGF.02286", id.getTextSigle());
+ assertEquals("match-GOE_AGF.02286-p2105-2106", id.toString());
+ };
@Test
public void posIdentifierExample1 () throws IOException {
@@ -242,6 +255,29 @@
km.getSnippetHTML());
};
+ @Test // Indexes one document of the new structure and checks the JSON of its match info
+ public void indexNewStructure () throws IOException, QueryException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(getClass().getResourceAsStream("/goe/AGX-00002.json"), false); // document comes from the /goe test resources
+ ki.commit();
+ // Request match info for span p210-211, addressed by an ID of the form "match-GOE!GOE_AGX.00002-..."
+ Match km = ki.getMatchInfo(
+ "match-GOE!GOE_AGX.00002-p210-211",
+ "tokens",
+ true,
+ (String) null,
+ (String) null,
+ true,
+ true,
+ true // NOTE(review): presumably the extend-to-sentence flag exercised by this fix; confirm against getMatchInfo's signature
+ );
+ // The serialized match must expose the field name, the text sigle and the author
+ JsonNode res = mapper.readTree(km.toJsonString());
+ assertEquals("tokens", res.at("/field").asText());
+ assertEquals("GOE_AGX.00002", res.at("/textSigle").asText());
+ assertEquals("Goethe, Johann Wolfgang von", res.at("/author").asText());
+ };
+
@Test
public void indexExample5Spans () throws IOException, QueryException {