Add support for identifiers with dashes (Schreibgebrauch project)

Change-Id: Ic177c055a14438415c0bcb0cd45d4788f375042f
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index fa4cc7b..fef6888 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -833,6 +833,9 @@
             boolean includeHighlights, boolean extendToSentence)
             throws QueryException {
 
+        if (DEBUG)
+            log.trace("Get info on {}", idString);
+
         Match match = new Match(idString, includeHighlights);
 
         if (this.getVersion() != null)
@@ -853,14 +856,24 @@
         }
 
         // <legacy>
-        else {
+        else if (match.getDocID() != null) {
             bool.add(new TermQuery(new Term("ID", match.getDocID())),
                     BooleanClause.Occur.MUST);
             bool.add(new TermQuery(new Term("corpusID", match.getCorpusID())),
                     BooleanClause.Occur.MUST);
-        };
+        }
         // </legacy>
 
+        // Invalid
+        else {
+            match.addError(730, "Invalid match identifier", idString);
+            return match;
+        };
+
+        if (DEBUG)
+            log.trace("The bool query is {}", bool.toString());
+
+
         Filter filter = (Filter) new QueryWrapperFilter(bool);
 
         CompiledAutomaton fst = null;
@@ -950,18 +963,22 @@
         };
 
         try {
+
             // Iterate over all atomic indices and find the matching document
             for (LeafReaderContext atomic : this.reader().leaves()) {
 
                 // Retrieve the single document of interest
-                DocIdSet filterSet = filter.getDocIdSet(atomic, atomic.reader()
-                        .getLiveDocs());
+                DocIdSet filterSet = filter.getDocIdSet(atomic, atomic.reader().getLiveDocs());
+
 
                 // Create a bitset for the correct document
                 Bits bitset = filterSet.bits();
 
                 DocIdSetIterator filterIterator = filterSet.iterator();
 
+                if (DEBUG)
+                    log.trace("Checking document in {} with {}", filterSet, bitset);
+
                 // No document found
                 if (filterIterator == null)
                     continue;
@@ -969,6 +986,9 @@
                 // Go to the matching doc - and remember its ID
                 int localDocID = filterIterator.nextDoc();
 
+                if (DEBUG)
+                    log.trace("localDocID is {}", localDocID);
+
                 if (localDocID == DocIdSetIterator.NO_MORE_DOCS)
                     continue;
 
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java b/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
index 32e3242..f02b074 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
@@ -48,7 +48,7 @@
             .compile("\\[(?:\\([0-9]+-[0-9]+\\))?([^\\]]+?)\\]");
 
     // This advices the java compiler to ignore all loggings
-    public static final boolean DEBUG = true;
+    public static final boolean DEBUG = false;
     private final Logger log = LoggerFactory
             .getLogger(MultiTermTokenStream.class);
 
diff --git a/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java b/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
index 7ebf161..8af5e36 100644
--- a/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
+++ b/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
@@ -10,8 +10,10 @@
 
     // TODO: "contains" is necessary for a compatibility bug in Kustvakt
     Pattern idRegex = Pattern.compile("^(?:match-|contains-)"
-            + "(?:([^!]+?)[!\\.])?" + "([^!]+)-p([0-9]+)-([0-9]+)"
-            + "((?:\\(-?[0-9]+\\)-?[0-9]+--?[0-9]+)*)" + "(?:c.+?)?$");
+                                      + "(?:([^!]+?)[!\\.])?" +
+                                      "([^!]+)-p([0-9]+)-([0-9]+)" +
+                                      "((?:\\(-?[0-9]+\\)-?[0-9]+--?[0-9]+)*)" +
+                                      "(?:c.+?)?$");
     Pattern posRegex = Pattern.compile("\\(([0-9]+)\\)([0-9]+)-([0-9]+)");
 
 
@@ -21,7 +23,7 @@
     public MatchIdentifier (String id) {
 
         // Replace for legacy reasons with incompatible versions of Kustvakt
-        id = id.replaceAll("^(contains-|match-)([^-!_\\.]+?)!\\2_", "$1$2_");
+        id = id.replaceAll("^(contains-|match-)([^!_\\.]+?)!\\2_", "$1$2_");
 
         Matcher matcher = idRegex.matcher(id);
         if (matcher.matches()) {
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index 796a6d9..a9bb540 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -28,7 +28,6 @@
 
     ObjectMapper mapper = new ObjectMapper();
 
-
     @Test
     public void identifierExample1 () throws IOException, QueryException {
         MatchIdentifier id = new MatchIdentifier("match-c1!d1-p4-20");
@@ -550,6 +549,17 @@
 
 
     @Test
+    public void indexFailingMatchID () throws IOException,
+            QueryException {
+        KrillIndex ki = new KrillIndex();
+        Match km = ki.getMatchInfo("match-PRO-DUD!PRO-DUD_KSTA-2013-01.7483-2013-01",
+                                   "tokens", "*", "m",
+                                   false, false);
+        JsonNode res = mapper.readTree(km.toJsonString());
+        assertEquals("730", res.at("/errors/0/0").asText());
+    };
+
+    @Test
     public void indexExampleNullInfo () throws IOException, QueryException {
         KrillIndex ki = new KrillIndex();
         ki.addDoc(createSimpleFieldDoc4());
diff --git a/src/test/java/de/ids_mannheim/korap/response/TestMatch.java b/src/test/java/de/ids_mannheim/korap/response/TestMatch.java
new file mode 100644
index 0000000..182300a
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/response/TestMatch.java
@@ -0,0 +1,37 @@
+package de.ids_mannheim.korap.response;
+
+import de.ids_mannheim.korap.response.Match;
+
+import static org.junit.Assert.*;
+import org.junit.Test;
+import org.junit.Ignore;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class TestMatch {
+
+    @Test
+    public void testNoMatch () {
+        Match m = new Match("aaa", false);
+        assertEquals(null, m.getID());
+    };
+
+    @Test
+    public void testMatchBug () {
+        Match m = new Match("match-PRO-DUD!PRO-DUD_KSTA-2013-01.7483-2013-01", false);
+        assertEquals(null, m.getID());
+    };
+
+    @Test
+    public void testMatchTextSigle1 () {
+        Match m = new Match("match-GOE!GOE_AGK.00000-p60348-60349", false);
+        assertEquals("GOE_AGK.00000", m.getTextSigle());
+    };
+
+    @Test
+    public void testMatchTextSigle2 () {
+        Match m = new Match("match-PRO-DUD!PRO-DUD_KSTA-2013-01.3651-p326-327", false);
+        assertEquals("PRO-DUD_KSTA-2013-01.3651", m.getTextSigle());
+    };
+};
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
index e7a6f2f..ed64596 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
@@ -1092,4 +1092,29 @@
         assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
         assertEquals(kr.getTotalResults(), 1);
     };
+
+    /**
+     * This is a Schreibgebrauch resource that didn't work for element queries.
+     */
+    @Test
+    public void searchSchreibgebrauchData () throws IOException {
+        // Construct index
+        KrillIndex ki = new KrillIndex();
+        // Indexing test files
+        ki.addDoc(getClass().getResourceAsStream("/sgbr/BSP-2013-01-32.json.gz"), true);
+        ki.commit();
+
+        Krill k = new Krill(new QueryBuilder("tokens").tag("base/s:s"));
+
+        assertEquals(k.getSpanQuery().toString(),
+                "<tokens:base/s:s />");
+
+        Result kr = k.apply(ki);
+        assertEquals(kr.getTotalResults(), 1);
+        assertEquals(kr.getMatch(0).getSnippetBrackets(),
+                "[Selbst ist der Jeck]");
+
+        assertEquals(kr.getMatch(0).getTextSigle(), "PRO-DUD_BSP-2013-01.32");
+    };
+
 };
diff --git a/src/test/resources/sgbr/BSP-2013-01-32.json b/src/test/resources/sgbr/BSP-2013-01-32.json
new file mode 100644
index 0000000..b059e10
--- /dev/null
+++ b/src/test/resources/sgbr/BSP-2013-01-32.json
@@ -0,0 +1 @@
+{"language":"de","store":{"sgbrKodex":"T","funder":"Bundesministerium für Bildung und Forschung"},"version":"0.03","data":{"tokenSource":"sgbr#lemma","foundries":"base base/sentences dereko dereko/structure sgbr sgbr/lemma sgbr/morpho","text":"Selbst ist der Jeck","layerInfos":"base/s=spans dereko/s=spans sgbr/l=tokens sgbr/lv=tokens sgbr/p=tokens","stream":[["-:base/sentences$<i>1","-:tokens$<i>4","<>:dereko/s:w$<b>64<i>0<i>6<i>1<b>4<s>2","<>:dereko/s:text$<b>64<i>0<i>18<i>3<b>0","<>:dereko/s:div$<b>64<i>0<i>18<i>3<b>1","<>:dereko/s:head$<b>64<i>0<i>18<i>3<b>2","<>:dereko/s:s$<b>64<i>0<i>18<i>3<b>3<s>1","<>:base/s:t$<b>64<i>0<i>19<i>4<b>0","<>:base/s:s$<b>64<i>0<i>19<i>4<b>2","@:dereko/s:n:1$<b>17<s>1<i>3","@:dereko/s:ana:\\#PRO.DUD.BSP.2013.01.POS.NE$<b>17<s>2<i>1","@:dereko/s:n:1$<b>17<s>2<i>1","@:dereko/s:lemmaRef:\\#PRO.DUD.BSP.2013.01.Lemmata.3773$<b>17<s>2<i>1","_0$<i>0<i>6","i:selbst","s:Selbst","sgbr/l:Selbst","sgbr/p:NE"],["<>:dereko/s:w$<b>64<i>7<i>10<i>2<b>4<s>1","@:dereko/s:lemmaRef:\\#PRO.DUD.BSP.2013.01.Lemmata.2$<b>17<s>1<i>2","@:dereko/s:ana:\\#PRO.DUD.BSP.2013.01.POS.VVFIN$<b>17<s>1<i>2","@:dereko/s:n:2$<b>17<s>1<i>2","_1$<i>7<i>10","i:ist","s:ist","sgbr/l:sein","sgbr/p:VVFIN"],["<>:dereko/s:w$<b>64<i>11<i>14<i>3<b>4<s>1","@:dereko/s:lemmaRef:\\#PRO.DUD.BSP.2013.01.Lemmata.3$<b>17<s>1<i>3","@:dereko/s:ana:\\#PRO.DUD.BSP.2013.01.POS.ART$<b>17<s>1<i>3","@:dereko/s:n:3$<b>17<s>1<i>3","_2$<i>11<i>14","i:der","s:der","sgbr/l:d_art","sgbr/p:ART"],["<>:dereko/s:div$<b>65<i>18<i>18<i>3<b>2","<>:dereko/s:p$<b>65<i>18<i>18<i>3<b>3","_3$<i>15<i>19","i:jeck","s:Jeck","sgbr/l:Jeck","sgbr/p:NE"]],"name":"tokens"},"corpusSigle":"PRO-DUD","author":"unbekannt","keywords":"sgbrKodex:T","docTitle":"Korpus zur Beobachtung des Schreibgebrauchs im Deutschen","docSigle":"PRO-DUD_BSP-2013-01","textSigle":"PRO-DUD_BSP-2013-01.32","docSubTitle":"Subkorpus Ortsblatt, Jahrgang 2013, Monat Januar","pubPlace":"Stadtingen","title":"Nur Platt, kein 
Deutsch","publisher":"Dorfblatt GmbH","pubDate":"20130126"}
\ No newline at end of file
diff --git a/src/test/resources/sgbr/BSP-2013-01-32.json.gz b/src/test/resources/sgbr/BSP-2013-01-32.json.gz
new file mode 100644
index 0000000..d9f206a
--- /dev/null
+++ b/src/test/resources/sgbr/BSP-2013-01-32.json.gz
Binary files differ