Add support for identifiers with dashes (Schreibgebrauch project)
Change-Id: Ic177c055a14438415c0bcb0cd45d4788f375042f
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index fa4cc7b..fef6888 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -833,6 +833,9 @@
boolean includeHighlights, boolean extendToSentence)
throws QueryException {
+ if (DEBUG)
+ log.trace("Get info on {}", idString);
+
Match match = new Match(idString, includeHighlights);
if (this.getVersion() != null)
@@ -853,14 +856,24 @@
}
// <legacy>
- else {
+ else if (match.getDocID() != null) {
bool.add(new TermQuery(new Term("ID", match.getDocID())),
BooleanClause.Occur.MUST);
bool.add(new TermQuery(new Term("corpusID", match.getCorpusID())),
BooleanClause.Occur.MUST);
- };
+ }
// </legacy>
+ // Invalid
+ else {
+ match.addError(730, "Invalid match identifier", idString);
+ return match;
+ };
+
+ if (DEBUG)
+ log.trace("The bool query is {}", bool.toString());
+
+
Filter filter = (Filter) new QueryWrapperFilter(bool);
CompiledAutomaton fst = null;
@@ -950,18 +963,22 @@
};
try {
+
// Iterate over all atomic indices and find the matching document
for (LeafReaderContext atomic : this.reader().leaves()) {
// Retrieve the single document of interest
- DocIdSet filterSet = filter.getDocIdSet(atomic, atomic.reader()
- .getLiveDocs());
+ DocIdSet filterSet = filter.getDocIdSet(atomic, atomic.reader().getLiveDocs());
+
// Create a bitset for the correct document
Bits bitset = filterSet.bits();
DocIdSetIterator filterIterator = filterSet.iterator();
+ if (DEBUG)
+ log.trace("Checking document in {} with {}", filterSet, bitset);
+
// No document found
if (filterIterator == null)
continue;
@@ -969,6 +986,9 @@
// Go to the matching doc - and remember its ID
int localDocID = filterIterator.nextDoc();
+ if (DEBUG)
+ log.trace("localDocID is {}", localDocID);
+
if (localDocID == DocIdSetIterator.NO_MORE_DOCS)
continue;
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java b/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
index 32e3242..f02b074 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
@@ -48,7 +48,7 @@
.compile("\\[(?:\\([0-9]+-[0-9]+\\))?([^\\]]+?)\\]");
// This advices the java compiler to ignore all loggings
- public static final boolean DEBUG = true;
+ public static final boolean DEBUG = false;
private final Logger log = LoggerFactory
.getLogger(MultiTermTokenStream.class);
diff --git a/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java b/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
index 7ebf161..8af5e36 100644
--- a/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
+++ b/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
@@ -10,8 +10,10 @@
// TODO: "contains" is necessary for a compatibility bug in Kustvakt
Pattern idRegex = Pattern.compile("^(?:match-|contains-)"
- + "(?:([^!]+?)[!\\.])?" + "([^!]+)-p([0-9]+)-([0-9]+)"
- + "((?:\\(-?[0-9]+\\)-?[0-9]+--?[0-9]+)*)" + "(?:c.+?)?$");
+ + "(?:([^!]+?)[!\\.])?" +
+ "([^!]+)-p([0-9]+)-([0-9]+)" +
+ "((?:\\(-?[0-9]+\\)-?[0-9]+--?[0-9]+)*)" +
+ "(?:c.+?)?$");
Pattern posRegex = Pattern.compile("\\(([0-9]+)\\)([0-9]+)-([0-9]+)");
@@ -21,7 +23,7 @@
public MatchIdentifier (String id) {
// Replace for legacy reasons with incompatible versions of Kustvakt
- id = id.replaceAll("^(contains-|match-)([^-!_\\.]+?)!\\2_", "$1$2_");
+ id = id.replaceAll("^(contains-|match-)([^!_\\.]+?)!\\2_", "$1$2_");
Matcher matcher = idRegex.matcher(id);
if (matcher.matches()) {
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index 796a6d9..a9bb540 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -28,7 +28,6 @@
ObjectMapper mapper = new ObjectMapper();
-
@Test
public void identifierExample1 () throws IOException, QueryException {
MatchIdentifier id = new MatchIdentifier("match-c1!d1-p4-20");
@@ -550,6 +549,17 @@
@Test
+ public void indexFailingMatchID () throws IOException,
+ QueryException {
+ KrillIndex ki = new KrillIndex();
+ Match km = ki.getMatchInfo("match-PRO-DUD!PRO-DUD_KSTA-2013-01.7483-2013-01",
+ "tokens", "*", "m",
+ false, false);
+ JsonNode res = mapper.readTree(km.toJsonString());
+ assertEquals("730", res.at("/errors/0/0").asText());
+ };
+
+ @Test
public void indexExampleNullInfo () throws IOException, QueryException {
KrillIndex ki = new KrillIndex();
ki.addDoc(createSimpleFieldDoc4());
diff --git a/src/test/java/de/ids_mannheim/korap/response/TestMatch.java b/src/test/java/de/ids_mannheim/korap/response/TestMatch.java
new file mode 100644
index 0000000..182300a
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/response/TestMatch.java
@@ -0,0 +1,37 @@
+package de.ids_mannheim.korap.response;
+
+import de.ids_mannheim.korap.response.Match;
+
+import static org.junit.Assert.*;
+import org.junit.Test;
+import org.junit.Ignore;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class TestMatch {
+
+ @Test
+ public void testNoMatch () {
+ Match m = new Match("aaa", false);
+ assertEquals(null, m.getID());
+ };
+
+ @Test
+ public void testMatchBug () {
+ Match m = new Match("match-PRO-DUD!PRO-DUD_KSTA-2013-01.7483-2013-01", false);
+ assertEquals(null, m.getID());
+ };
+
+ @Test
+ public void testMatchTextSigle1 () {
+ Match m = new Match("match-GOE!GOE_AGK.00000-p60348-60349", false);
+ assertEquals("GOE_AGK.00000", m.getTextSigle());
+ };
+
+ @Test
+ public void testMatchTextSigle2 () {
+ Match m = new Match("match-PRO-DUD!PRO-DUD_KSTA-2013-01.3651-p326-327", false);
+ assertEquals("PRO-DUD_KSTA-2013-01.3651", m.getTextSigle());
+ };
+};
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
index e7a6f2f..ed64596 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
@@ -1092,4 +1092,29 @@
assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
assertEquals(kr.getTotalResults(), 1);
};
+
+ /**
+     * This is a Schreibgebrauch resource that didn't work for element queries.
+ */
+ @Test
+ public void searchSchreibgebrauchData () throws IOException {
+ // Construct index
+ KrillIndex ki = new KrillIndex();
+ // Indexing test files
+ ki.addDoc(getClass().getResourceAsStream("/sgbr/BSP-2013-01-32.json.gz"), true);
+ ki.commit();
+
+ Krill k = new Krill(new QueryBuilder("tokens").tag("base/s:s"));
+
+ assertEquals(k.getSpanQuery().toString(),
+ "<tokens:base/s:s />");
+
+ Result kr = k.apply(ki);
+ assertEquals(kr.getTotalResults(), 1);
+ assertEquals(kr.getMatch(0).getSnippetBrackets(),
+ "[Selbst ist der Jeck]");
+
+ assertEquals(kr.getMatch(0).getTextSigle(), "PRO-DUD_BSP-2013-01.32");
+ };
+
};
diff --git a/src/test/resources/sgbr/BSP-2013-01-32.json b/src/test/resources/sgbr/BSP-2013-01-32.json
new file mode 100644
index 0000000..b059e10
--- /dev/null
+++ b/src/test/resources/sgbr/BSP-2013-01-32.json
@@ -0,0 +1 @@
+{"language":"de","store":{"sgbrKodex":"T","funder":"Bundesministerium für Bildung und Forschung"},"version":"0.03","data":{"tokenSource":"sgbr#lemma","foundries":"base base/sentences dereko dereko/structure sgbr sgbr/lemma sgbr/morpho","text":"Selbst ist der Jeck","layerInfos":"base/s=spans dereko/s=spans sgbr/l=tokens sgbr/lv=tokens sgbr/p=tokens","stream":[["-:base/sentences$<i>1","-:tokens$<i>4","<>:dereko/s:w$<b>64<i>0<i>6<i>1<b>4<s>2","<>:dereko/s:text$<b>64<i>0<i>18<i>3<b>0","<>:dereko/s:div$<b>64<i>0<i>18<i>3<b>1","<>:dereko/s:head$<b>64<i>0<i>18<i>3<b>2","<>:dereko/s:s$<b>64<i>0<i>18<i>3<b>3<s>1","<>:base/s:t$<b>64<i>0<i>19<i>4<b>0","<>:base/s:s$<b>64<i>0<i>19<i>4<b>2","@:dereko/s:n:1$<b>17<s>1<i>3","@:dereko/s:ana:\\#PRO.DUD.BSP.2013.01.POS.NE$<b>17<s>2<i>1","@:dereko/s:n:1$<b>17<s>2<i>1","@:dereko/s:lemmaRef:\\#PRO.DUD.BSP.2013.01.Lemmata.3773$<b>17<s>2<i>1","_0$<i>0<i>6","i:selbst","s:Selbst","sgbr/l:Selbst","sgbr/p:NE"],["<>:dereko/s:w$<b>64<i>7<i>10<i>2<b>4<s>1","@:dereko/s:lemmaRef:\\#PRO.DUD.BSP.2013.01.Lemmata.2$<b>17<s>1<i>2","@:dereko/s:ana:\\#PRO.DUD.BSP.2013.01.POS.VVFIN$<b>17<s>1<i>2","@:dereko/s:n:2$<b>17<s>1<i>2","_1$<i>7<i>10","i:ist","s:ist","sgbr/l:sein","sgbr/p:VVFIN"],["<>:dereko/s:w$<b>64<i>11<i>14<i>3<b>4<s>1","@:dereko/s:lemmaRef:\\#PRO.DUD.BSP.2013.01.Lemmata.3$<b>17<s>1<i>3","@:dereko/s:ana:\\#PRO.DUD.BSP.2013.01.POS.ART$<b>17<s>1<i>3","@:dereko/s:n:3$<b>17<s>1<i>3","_2$<i>11<i>14","i:der","s:der","sgbr/l:d_art","sgbr/p:ART"],["<>:dereko/s:div$<b>65<i>18<i>18<i>3<b>2","<>:dereko/s:p$<b>65<i>18<i>18<i>3<b>3","_3$<i>15<i>19","i:jeck","s:Jeck","sgbr/l:Jeck","sgbr/p:NE"]],"name":"tokens"},"corpusSigle":"PRO-DUD","author":"unbekannt","keywords":"sgbrKodex:T","docTitle":"Korpus zur Beobachtung des Schreibgebrauchs im Deutschen","docSigle":"PRO-DUD_BSP-2013-01","textSigle":"PRO-DUD_BSP-2013-01.32","docSubTitle":"Subkorpus Ortsblatt, Jahrgang 2013, Monat Januar","pubPlace":"Stadtingen","title":"Nur Platt, kein 
Deutsch","publisher":"Dorfblatt GmbH","pubDate":"20130126"}
\ No newline at end of file
diff --git a/src/test/resources/sgbr/BSP-2013-01-32.json.gz b/src/test/resources/sgbr/BSP-2013-01-32.json.gz
new file mode 100644
index 0000000..d9f206a
--- /dev/null
+++ b/src/test/resources/sgbr/BSP-2013-01-32.json.gz
Binary files differ