Added sentence expansion for match info
diff --git a/CHANGES b/CHANGES
index 0d8e3fc..8d4f839 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,6 @@
+0.30.1 2014-02-13
+ - Added sentence extension for match info (diewald)
+
0.30 2014-02-13
- This is a major version (prepared for the IDS meeting on the 17th of february)
- Improved stringification for distance queries (margaretha)
diff --git a/pom.xml b/pom.xml
index cb77ece..03d4809 100644
--- a/pom.xml
+++ b/pom.xml
@@ -11,7 +11,7 @@
-->
<groupId>KorAP-modules</groupId>
<artifactId>KorAP-lucene-index</artifactId>
- <version>0.30</version>
+ <version>0.30.1</version>
<packaging>jar</packaging>
<name>KorAP-lucene-index</name>
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index 34f4962..d2825dc 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -75,6 +75,7 @@
import de.ids_mannheim.korap.index.TermInfo;
import de.ids_mannheim.korap.index.SpanInfo;
import de.ids_mannheim.korap.index.MatchIdentifier;
+import de.ids_mannheim.korap.query.SpanElementQuery;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -461,7 +462,7 @@
public KorapMatch getMatch (String id) {
- return this.getMatchInfo(id, "tokens", false, null, null, false, true);
+ return this.getMatchInfo(id, "tokens", false, null, null, false, true, false);
};
public KorapMatch getMatchInfo (String id,
@@ -470,7 +471,17 @@
String layer,
boolean includeSpans,
boolean includeHighlights) {
- return this.getMatchInfo(id, field, true, foundry, layer, includeSpans, includeHighlights);
+ return this.getMatchInfo(id, field, true, foundry, layer, includeSpans, includeHighlights, false);
+ };
+
+ public KorapMatch getMatchInfo (String id,
+ String field,
+ String foundry,
+ String layer,
+ boolean includeSpans,
+ boolean includeHighlights,
+ boolean extendToSentence) { // overload that lets the caller choose the sentence flag
+ return this.getMatchInfo(id, field, true, foundry, layer, includeSpans, includeHighlights, extendToSentence); // delegates to the 8-argument variant; third argument is fixed to true
};
/**
@@ -487,7 +498,8 @@
String foundry,
String layer,
boolean includeSpans,
- boolean includeHighlights) {
+ boolean includeHighlights,
+ boolean extendToSentence) {
KorapMatch match = new KorapMatch(idString, includeHighlights);
@@ -577,6 +589,47 @@
if (!info) break;
+ // When requested, search the "s" element spans for the minimal sentence range surrounding the match
+ if (extendToSentence) {
+
+ SpanElementQuery squery = new SpanElementQuery(field, "s");
+ Spans sentence = squery.getSpans(atomic,
+ (Bits) bitset,
+ new HashMap<Term, TermContext>());
+
+ log.trace("Now search for {}", sentence.toString());
+
+ int newStart = -1, newEnd = -1; // -1 marks a boundary that has not been found yet
+
+ while (true) {
+
+ // Stop scanning once no further s span is available
+ if (sentence.next() != true)
+ break;
+
+ // This s starts at or before the match start: remember the largest such start seen so far
+ if (sentence.start() <= match.getStartPos()) {
+ newStart = sentence.start() > newStart ? sentence.start() : newStart;
+ }
+ else if (newStart == -1) // this s starts after the match start and none started at or before it: give up
+ break;
+
+ // This s ends at or after the match end: take its end and stop scanning
+ if (sentence.end() >= match.getEndPos()) {
+ newEnd = sentence.end();
+ break;
+ };
+ };
+
+ // Replace the match boundaries only if both a new start and a new end were found
+ if (newStart > -1 && newEnd > -1) {
+ log.trace("New match spans from {}-{}", newStart, newEnd);
+ match.setStartPos(newStart);
+ match.setEndPos(newEnd);
+ };
+ };
+
+
// Limit the terms to all the terms of interest
TermsEnum termsEnum = docTerms.intersect(fst, null);
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java b/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
index 51308d1..6c80e4c 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
@@ -208,6 +208,9 @@
assertEquals("... Orte in [Norwegen]: Å i ...", kr.match(2).getSnippetBrackets());
assertEquals("WPD_AAA.00005", kr.match(2).getDocID());
+ /*
+ System.err.println(ki.getMatchInfo(kr.match(2).getID(), "tokens", "xip", "l", true, false).getSnippetHTML());
+ */
query = kq.seg("tt/l:Vokal").without("mate/m:number:sg").toQuery();
kr = ki.search(query, 0, (short) 5, true, (short) 2, false, (short) 5);
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index ffbf04e..b556c6f 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -479,6 +479,33 @@
};
+ @Test
+ public void indexExample7SentenceExpansion () throws IOException {
+ KorapIndex ki = new KorapIndex();
+ ki.addDoc(createSimpleFieldDoc());
+ ki.commit();
+
+ KorapMatch km = ki.getMatchInfo("match-c1!d1-p3-4", // six-argument overload: sentence flag defaults to false
+ "tokens",
+ null, // foundry
+ null, // layer
+ false, // includeSpans
+ false); // includeHighlights
+
+ assertEquals("... [{f/m:vier:{f/y:four:{it/is:4:{x/o:viertens:a}}}}] ...",
+ km.getSnippetBrackets());
+
+ km = ki.getMatchInfo("match-c1!d1-p3-4", // same match id, now with the sentence flag set
+ "tokens",
+ null, // foundry
+ null, // layer
+ false, // includeSpans
+ false, // includeHighlights
+ true); // extendToSentence
+
+ assertEquals("... [{f/m:drei:{f/y:three:{it/is:3:{x/o:drittens:c}}}}{f/m:vier:{f/y:four:{it/is:4:{x/o:viertens:a}}}}{f/m:fuenf:{f/y:five:{it/is:5:{x/o:fünftens:b}}}}] ...",
+ km.getSnippetBrackets());
};
private FieldDocument createSimpleFieldDoc(){
FieldDocument fd = new FieldDocument();
@@ -488,7 +515,7 @@
"abcabcabac",
"[(0-1)s:a|i:a|f/m:eins|f/y:one|x/o:erstens|it/is:1|>:x/rel:a$<i>4|_0#0-1|-:t$<i>10]" +
"[(1-2)s:b|i:b|f/m:zwei|f/y:two|x/o:zweitens|it/is:2|_1#1-2]" +
- "[(2-3)s:c|i:c|f/m:drei|f/y:three|x/o:drittens|it/is:3|_2#2-3]" +
+ "[(2-3)s:c|i:c|f/m:drei|f/y:three|x/o:drittens|it/is:3|_2#2-3|<>:s#2-5$<i>5]" +
"[(3-4)s:a|i:a|f/m:vier|f/y:four|x/o:viertens|it/is:4|<:x/rel:b$<i>1|_3#3-4]" +
"[(4-5)s:b|i:b|f/m:fuenf|f/y:five|x/o:fünftens|it/is:5|_4#4-5]" +
"[(5-6)s:c|i:c|f/m:sechs|f/y:six|x/o:sechstens|it/is:6|_5#5-6]" +
diff --git a/src/test/java/de/ids_mannheim/korap/query/TestKorapQueryJSON.java b/src/test/java/de/ids_mannheim/korap/query/TestKorapQueryJSON.java
index 979dacd..780a397 100644
--- a/src/test/java/de/ids_mannheim/korap/query/TestKorapQueryJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/query/TestKorapQueryJSON.java
@@ -200,11 +200,19 @@
public void queryJSONcosmas4 () {
SpanQueryWrapperInterface sqwi = jsonQuery(getClass().getResource("/queries/cosmas4.json").getFile());
- // "das /+w1:3,s1 Buch"
+ // "das /+w1:3,s1:1 Buch"
assertEquals(sqwi.toQuery().toString(), "spanMultipleDistance(tokens:s:das, tokens:s:Buch, [(w[1:3], ordered, notExcluded), (s[1:1], ordered, notExcluded)])");
};
@Test
+ public void queryJSONcosmas4b () {
+ SpanQueryWrapperInterface sqwi = jsonQuery(getClass().getResource("/queries/cosmas4b.json").getFile());
+
+ // "das /+w1:3,s1 Buch"
+ assertEquals(sqwi.toQuery().toString(), "spanMultipleDistance(tokens:s:das, tokens:s:Buch, [(w[1:3], ordered, notExcluded), (s[0:1], ordered, notExcluded)])");
+ };
+
+ @Test
public void queryJSONcosmas10 () {
SpanQueryWrapperInterface sqwi = jsonQuery(getClass().getResource("/queries/cosmas10.json").getFile());
diff --git a/src/test/resources/queries/readme.txt b/src/test/resources/queries/readme.txt
index 2aa2f2e..c5b3d63 100644
--- a/src/test/resources/queries/readme.txt
+++ b/src/test/resources/queries/readme.txt
@@ -20,7 +20,7 @@
// Based on KorAP-querySerialization/examples/
cosmas3: "das /+w1:3 Buch" # word-distance constraint
-cosmas4: "das /+w1:3,s1 Buch" # combined word-distance and sent-distance constraint
+cosmas4: "das /+w1:3,s1:1 Buch" # combined word-distance and sent-distance constraint
cosmas10: "Institut für $deutsche Sprache" # finds both
cosmas16: "$wegen #IN(L) <s>" # finds 'wegen' at beginning of sentence, also when capitalised
cosmas17: "#BED($wegen , +sa)" # equivalent to above