// blob: 1c23494d23d043e91d505767fbec568eabd953b0 [file] [log] [blame]
package de.ids_mannheim.korap.index;
import java.util.*;
import java.io.*;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Bits;
import static org.junit.Assert.*;
import org.junit.Test;
import org.junit.Ignore;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import de.ids_mannheim.korap.KorapIndex;
import de.ids_mannheim.korap.KorapQuery;
import de.ids_mannheim.korap.KorapResult;
import de.ids_mannheim.korap.KorapSearch;
import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.analysis.MultiTermTokenStream;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
/**
 * Tests regular-expression ({@code re}) and wildcard ({@code wc}) term
 * queries built with {@link KorapQuery} against a small in-memory
 * {@link KorapIndex}.
 *
 * Every test indexes a single ten-token document into the field
 * {@code "base"}, runs one or more queries and checks both the number of
 * hits and the bracketed snippet of each hit. All searches use one token
 * of left and one token of right context.
 */
@RunWith(JUnit4.class)
public class TestRegexWildcardIndex {

    /** Primary text of the all-lower-case fixture document. */
    private static final String LOWERCASE_TEXT =
        "affe afffe baum baumgarten steingarten franz hans haus efeu effe";

    /**
     * Token annotation string handed to {@code FieldDocument.addTV}
     * together with {@link #LOWERCASE_TEXT}; one bracketed entry per
     * token, each carrying an {@code s:} term for the surface form.
     */
    private static final String LOWERCASE_TOKENS =
        "[(0-4)s:affe|_0#0-4|-:t$<i>10]" +
        "[(5-10)s:afffe|_1#5-10]" +
        "[(11-15)s:baum|_2#11-15]" +
        "[(16-26)s:baumgarten|_3#16-26]" +
        "[(27-38)s:steingarten|_4#27-38]" +
        "[(39-44)s:franz|_5#39-44]" +
        "[(45-49)s:hans|_6#45-49]" +
        "[(50-54)s:haus|_7#50-54]" +
        "[(55-59)s:efeu|_8#55-59]" +
        "[(60-64)s:effe|_9#60-64]";

    /**
     * Creates a fresh index containing exactly one committed document.
     *
     * @param text   primary text stored in the {@code "base"} field
     * @param tokens token annotation string belonging to {@code text}
     * @return the index, already committed and ready to be searched
     * @throws Exception if creating, filling or committing the index fails
     */
    private static KorapIndex createIndex (String text, String tokens) throws Exception {
        KorapIndex ki = new KorapIndex();
        FieldDocument fd = new FieldDocument();
        fd.addTV("base", text, tokens);
        ki.addDoc(fd);
        ki.commit();
        return ki;
    }

    /**
     * Wraps a query into a search whose left and right context are both
     * measured in tokens and are one token long.
     *
     * @param sq the span query to search for
     * @return the configured search object
     * @throws Exception if the search object cannot be set up
     */
    private static KorapSearch createSearch (SpanQuery sq) throws Exception {
        KorapSearch ks = new KorapSearch(sq);
        ks.context.left.setToken(true).setLength(1);
        ks.context.right.setToken(true).setLength(1);
        return ks;
    }

    /**
     * Regular expressions on the {@code s:} terms: repetition
     * ({@code af*e}), open suffix ({@code baum.*}), optional character
     * ({@code .....?garten}), single wildcard character ({@code ha.s})
     * and infix match ({@code .*ff.*}).
     */
    @Test
    public void indexRegex () throws Exception {
        KorapIndex ki = createIndex(LOWERCASE_TEXT, LOWERCASE_TOKENS);

        KorapQuery kq = new KorapQuery("base");
        SpanQuery sq = kq.re("s:af*e").toQuery();
        assertEquals("SpanMultiTermQueryWrapper(base:/s:af*e/)", sq.toString());

        KorapSearch ks = createSearch(sq);

        KorapResult kr = ki.search(ks);
        assertEquals(2, kr.getTotalResults());
        assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("affe [afffe] baum ...", kr.getMatch(1).getSnippetBrackets());

        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:baum.*").toQuery()));
        assertEquals(2, kr.getTotalResults());
        assertEquals("... afffe [baum] baumgarten ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(1).getSnippetBrackets());

        // Four mandatory plus one optional character before "garten"
        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.....?garten").toQuery()));
        assertEquals(2, kr.getTotalResults());
        assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("... baumgarten [steingarten] franz ...", kr.getMatch(1).getSnippetBrackets());

        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:ha.s").toQuery()));
        assertEquals(2, kr.getTotalResults());
        assertEquals("... franz [hans] haus ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("... hans [haus] efeu ...", kr.getMatch(1).getSnippetBrackets());

        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.*ff.*").toQuery()));
        assertEquals(3, kr.getTotalResults());
        assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("affe [afffe] baum ...", kr.getMatch(1).getSnippetBrackets());
        assertEquals("... efeu [effe]", kr.getMatch(2).getSnippetBrackets());
    }

    /**
     * Wildcard queries on the {@code s:} terms, using {@code *} and
     * {@code ?}. Also checks that the regex-style pattern
     * {@code baum.*} yields no hits when issued as a wildcard query.
     */
    @Test
    public void indexWildcard () throws Exception {
        KorapIndex ki = createIndex(LOWERCASE_TEXT, LOWERCASE_TOKENS);

        KorapQuery kq = new KorapQuery("base");
        SpanQuery sq = kq.wc("s:af*e").toQuery();
        assertEquals("SpanMultiTermQueryWrapper(base:s:af*e)", sq.toString());

        KorapSearch ks = createSearch(sq);

        KorapResult kr = ki.search(ks);
        assertEquals(2, kr.getTotalResults());
        assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("affe [afffe] baum ...", kr.getMatch(1).getSnippetBrackets());

        // Regex syntax is not wildcard syntax: no term matches this pattern
        kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:baum.*").toQuery()));
        assertEquals(0, kr.getTotalResults());

        kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:baum*").toQuery()));
        assertEquals(2, kr.getTotalResults());
        assertEquals("... afffe [baum] baumgarten ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(1).getSnippetBrackets());

        kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:*garten").toQuery()));
        assertEquals(2, kr.getTotalResults());
        assertEquals("... baum [baumgarten] steingarten ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("... baumgarten [steingarten] franz ...", kr.getMatch(1).getSnippetBrackets());

        kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:ha?s").toQuery()));
        assertEquals(2, kr.getTotalResults());
        assertEquals("... franz [hans] haus ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("... hans [haus] efeu ...", kr.getMatch(1).getSnippetBrackets());

        // "afffe" is expected to be absent from the hits of this pattern
        kr = ki.search(ks.setQuery(new KorapQuery("base").wc("s:?ff?").toQuery()));
        assertEquals(2, kr.getTotalResults());
        assertEquals("[affe] afffe ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("... efeu [effe]", kr.getMatch(1).getSnippetBrackets());
    }

    /**
     * Regular expressions with the case-insensitivity flag. The fixture
     * is mixed-case and every token carries a lower-cased {@code i:} term
     * next to its {@code s:} term. With the flag set to {@code true} the
     * generated query addresses the {@code i:} term with a lower-cased
     * pattern; without the flag (or with {@code false}) the pattern is
     * matched against the {@code s:} term as written.
     */
    @Test
    public void indexRegexCaseInsensitive () throws Exception {
        KorapIndex ki = createIndex(
            "AfFe aFfFE Baum Baumgarten SteinGarten franZ HaNs Haus Efeu effe",
            "[(0-4)s:AfFe|i:affe|_0#0-4|-:t$<i>10]" +
            "[(5-10)s:aFfFE|i:afffe|_1#5-10]" +
            "[(11-15)s:Baum|i:baum|_2#11-15]" +
            "[(16-26)s:Baumgarten|i:baumgarten|_3#16-26]" +
            "[(27-38)s:SteinGarten|i:steingarten|_4#27-38]" +
            "[(39-44)s:franZ|i:franz|_5#39-44]" +
            "[(45-49)s:HaNs|i:hans|_6#45-49]" +
            "[(50-54)s:Haus|i:haus|_7#50-54]" +
            "[(55-59)s:Efeu|i:efeu|_8#55-59]" +
            "[(60-64)s:effe|i:effe|_9#60-64]"
        );

        KorapQuery kq = new KorapQuery("base");
        SpanQuery sq = kq.re("s:Af*e", true).toQuery();
        assertEquals("SpanMultiTermQueryWrapper(base:/i:af*e/)", sq.toString());

        KorapSearch ks = createSearch(sq);

        KorapResult kr = ki.search(ks);
        assertEquals(2, kr.getTotalResults());
        assertEquals("[AfFe] aFfFE ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("AfFe [aFfFE] Baum ...", kr.getMatch(1).getSnippetBrackets());

        // Case-sensitive: only the token starting with "Af" is hit
        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:Af.*e").toQuery()));
        assertEquals(1, kr.getTotalResults());
        assertEquals("[AfFe] aFfFE ...", kr.getMatch(0).getSnippetBrackets());

        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:baum.*", true).toQuery()));
        assertEquals(2, kr.getTotalResults());
        assertEquals("... aFfFE [Baum] Baumgarten ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("... Baum [Baumgarten] SteinGarten ...", kr.getMatch(1).getSnippetBrackets());

        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.*garten", true).toQuery()));
        assertEquals(2, kr.getTotalResults());
        assertEquals("... Baum [Baumgarten] SteinGarten ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("... Baumgarten [SteinGarten] franZ ...", kr.getMatch(1).getSnippetBrackets());

        // Explicitly case-sensitive: "SteinGarten" no longer matches
        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.*garten", false).toQuery()));
        assertEquals(1, kr.getTotalResults());
        assertEquals("... Baum [Baumgarten] SteinGarten ...", kr.getMatch(0).getSnippetBrackets());

        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:ha.s", true).toQuery()));
        assertEquals(2, kr.getTotalResults());
        assertEquals("... franZ [HaNs] Haus ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("... HaNs [Haus] Efeu ...", kr.getMatch(1).getSnippetBrackets());

        kr = ki.search(ks.setQuery(new KorapQuery("base").re("s:.*f*e", true).toQuery()));
        assertEquals(3, kr.getTotalResults());
        assertEquals("[AfFe] aFfFE ...", kr.getMatch(0).getSnippetBrackets());
        assertEquals("AfFe [aFfFE] Baum ...", kr.getMatch(1).getSnippetBrackets());
        assertEquals("... Efeu [effe]", kr.getMatch(2).getSnippetBrackets());
    }

    /**
     * A regular expression used as the second element of a sequence:
     * the plain segment {@code s:affe} directly followed by a token
     * matching {@code af*e}.
     */
    @Test
    public void indexRegexCombined () throws Exception {
        KorapIndex ki = createIndex(LOWERCASE_TEXT, LOWERCASE_TOKENS);

        KorapQuery kq = new KorapQuery("base");
        SpanQuery sq = kq.seq(kq.seg("s:affe")).append(kq.re("s:af*e")).toQuery();
        assertEquals(
            "spanNext(base:s:affe, SpanMultiTermQueryWrapper(base:/s:af*e/))",
            sq.toString()
        );

        KorapSearch ks = createSearch(sq);

        KorapResult kr = ki.search(ks);
        assertEquals(1, kr.getTotalResults());
        assertEquals("[affe afffe] baum ...", kr.getMatch(0).getSnippetBrackets());
    }

    /**
     * Regular expressions nested inside a {@code contains} query: a
     * sequence of two regex matches that has to contain the plain
     * segment {@code s:affe}.
     */
    @Test
    public void indexRegexWithinRewrite () throws Exception {
        KorapIndex ki = createIndex(LOWERCASE_TEXT, LOWERCASE_TOKENS);

        KorapQuery kq = new KorapQuery("base");
        SpanQuery sq = kq.contains(
            kq.seq(
                kq.re("s:a.*e")
            ).append(
                kq.re("s:af*e")
            ),
            kq.seg("s:affe")).toQuery();
        assertEquals(
            "spanContain(spanNext(SpanMultiTermQueryWrapper(base:/s:a.*e/), SpanMultiTermQueryWrapper(base:/s:af*e/)), base:s:affe)",
            sq.toString()
        );

        KorapSearch ks = createSearch(sq);

        KorapResult kr = ki.search(ks);
        assertEquals(1, kr.getTotalResults());
        assertEquals("[affe afffe] baum ...", kr.getMatch(0).getSnippetBrackets());
    }
}