Fixed escaping in snippets (HTML and brackets)
Change-Id: I51b4b44998e6bfb0750e716c82d57ea5a820c741
diff --git a/Changes b/Changes
index 6bbcacf..e643469 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.55.6 2016-06-29
+0.55.6 2016-07-27
- [bugfix] distance with key "t" uses default foundry (diewald)
- [cleanup] Renamed fromJson() to fromKoral() (diewald)
- [cleanup] Removed deprecated methods in Krill:
@@ -13,6 +13,7 @@
- [feature] Added document method to Web-API (diewald)
- [feature] Added experimental KrillStats class (diewald)
- [bugfix] Escape quotes in JSON strings (diewald)
+ - [bugfix] Escape XML and Brackets in snippets correctly (diewald)
0.55.5 2016-05-02
- [performance] Changed to a dynamic window for sorting in FocusSpans (margaretha)
diff --git a/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java b/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java
index 55837b8..42f1c08 100644
--- a/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java
+++ b/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java
@@ -58,7 +58,7 @@
else if (this.number < -1) {
sb.append("<span xml:id=\"")
- .append(match.getPosID(match.getClassID(this.number)))
+ .append(escapeHTML(match.getPosID(match.getClassID(this.number))))
.append("\">");
}
@@ -66,14 +66,14 @@
sb.append("<span ");
if (this.number < 2048) {
sb.append("title=\"")
- .append(match.getAnnotationID(this.number))
+ .append(escapeHTML(match.getAnnotationID(this.number)))
.append('"');
}
else {
Relation rel = match.getRelationID(this.number);
- sb.append("xlink:title=\"").append(rel.annotation)
+ sb.append("xlink:title=\"").append(escapeHTML(rel.annotation))
.append("\" xlink:type=\"simple\" xlink:href=\"#")
- .append(match.getPosID(rel.ref)).append('"');
+ .append(escapeHTML(match.getPosID(rel.ref))).append('"');
};
sb.append('>');
}
@@ -146,6 +146,7 @@
else if (this.number != 0)
sb.append(this.number).append(':');
};
+
return sb.toString();
}
else if (this.type == 2) {
@@ -153,6 +154,6 @@
return "]";
return "}";
};
- return this.characters;
+ return escapeBrackets(this.characters);
};
};
diff --git a/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java b/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
index b41cd2e..10f0303 100644
--- a/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
+++ b/src/main/java/de/ids_mannheim/korap/response/match/MatchIdentifier.java
@@ -8,9 +8,10 @@
private ArrayList<int[]> pos = new ArrayList<>(8);
- // TODO: "contains" is necessary for a compatibility bug in Kustvakt
- Pattern idRegex = Pattern.compile("^(?:match-|contains-)"
- + "(?:([^!]+?)[!\\.])?" + "([^!]+)-p([0-9]+)-([0-9]+)"
+ // Remember: "contains" is necessary for a compatibility bug in Kustvakt
+ Pattern idRegex = Pattern.compile("^(?:match-|contains-)" +
+ "(?:([^!]+?)[!\\.])?" +
+ "([^!]+)[-/]p([0-9]+)-([0-9]+)"
+ "((?:\\(-?[0-9]+\\)-?[0-9]+--?[0-9]+)*)" + "(?:c.+?)?$");
Pattern posRegex = Pattern.compile("\\(([0-9]+)\\)([0-9]+)-([0-9]+)");
@@ -18,6 +19,12 @@
public MatchIdentifier () {};
+ /**
+ * Construct a new MatchIdentifier.
+ * Due to lots of internal changes and compatibility reasons,
+ * the structure of the identifier has changed a lot.
+ * The constructor supports different legacy structures for test compatibility.
+ */
public MatchIdentifier (String id) {
// Replace for legacy reasons with incompatible versions of Kustvakt
@@ -26,17 +33,25 @@
Matcher matcher = idRegex.matcher(id);
if (matcher.matches()) {
+ // textSigle is provided directly
+ if (matcher.group(1) == null && id.contains("/")) {
+ // Todo: potentially use UID!
+ this.setTextSigle(matcher.group(2));
+ }
+
// <legacy>
- // and test compatibility
- if (id.contains("!") || !id.contains("_")) {
+ else if (id.contains("!") || !id.contains("_")) {
this.setCorpusID(matcher.group(1));
this.setDocID(matcher.group(2));
}
- // </legacy>
+ // </legacy>
+
+ // textSigle is provided indirectly
+ // <legacy>
else {
- // this.getCorpusID() + "." + this.getDocID()
this.setTextSigle(matcher.group(1) + '.' + matcher.group(2));
};
+ // </legacy>
this.setStartPos(Integer.parseInt(matcher.group(3)));
this.setEndPos(Integer.parseInt(matcher.group(4)));
diff --git a/src/main/java/de/ids_mannheim/korap/util/KrillString.java b/src/main/java/de/ids_mannheim/korap/util/KrillString.java
index cc1f357..8a11d33 100644
--- a/src/main/java/de/ids_mannheim/korap/util/KrillString.java
+++ b/src/main/java/de/ids_mannheim/korap/util/KrillString.java
@@ -58,6 +58,17 @@
/**
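+ * Escape characters with special meaning in bracket snippets (curly and square brackets and the backslash) by prefixing each with a backslash.
+ *
+ * @param text
+ * The string to escape.
+ * @return The escaped string.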
+ */
+ public static String escapeBrackets (String text) {
+ return text.replaceAll("([\\{\\}\\[\\]\\\\])", "\\\\$1");
+ };
+
+ /**
* Add surrounding double quotes.
*
* @param text
diff --git a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
index c2da28b..d363d7d 100644
--- a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
+++ b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
@@ -351,4 +351,54 @@
assertEquals(kr.getError(0).getMessage(),
"Valid class numbers exceeded");
};
+
+ @Test
+ public void highlightEscapes () throws IOException, QueryException {
+ KrillIndex ki = new KrillIndex();
+ FieldDocument fd = new FieldDocument();
+ fd.addString("ID", "doc-1");
+ fd.addString("UID", "1");
+ fd.addString("textSigle", "c1/d1/1");
+
+ // Make this clean for HTML and Brackets!
+
+ fd.addTV("base", "Mit \"Mann\" & {Ma\\us}",
+ "[(0-3)s:Mit|i:mit|_0#0-3|-:t$<i>4|<>:base/t:t$<b>64<i>0<i>20<i>4<b>0]" +
+ "[(4-10)s:\"Mann\"|i:\"mann\"|base/l:\"Mann\"|_1#4-10]" +
+ "[(11-12)s:&|i:&|base/l:&|_2#11-12]" +
+ "[(13-20)s:{Ma\\us}|i:{ma\\us}|_3#13-20]");
+ ki.addDoc(fd);
+
+ // Commit!
+ ki.commit();
+ QueryBuilder kq = new QueryBuilder("base");
+ SpanQuery q = (SpanQuery) kq.tag("base/t:t").toQuery();
+
+ Krill qs = new Krill(q);
+ qs.getMeta().getContext().left.setToken(true).setLength((short) 0);
+ qs.getMeta().getContext().right.setToken(true).setLength((short) 0);
+
+ Result kr = ki.search(qs);
+ assertEquals((long) 1, kr.getTotalResults());
+ assertEquals("[Mit \"Mann\" & \\{Ma\\\\us\\}]", kr.getMatch(0).getSnippetBrackets());
+ assertEquals("<span class=\"context-left\"></span><mark>Mit &quot;Mann&quot; &amp; {Ma\\us}</mark><span class=\"context-right\"></span>", kr.getMatch(0).getSnippetHTML());
+ assertEquals("match-c1/d1/1-p0-4", kr.getMatch(0).getID());
+
+ Match km = ki.getMatchInfo("match-c1/d1/1-p0-4", "base", true, (ArrayList) null, (ArrayList) null, true, true, false);
+ assertEquals(0, km.getStartPos());
+ assertEquals("<span class=\"context-left\"></span>"+
+ "<mark><span title=\"base/t:t\">"+
+ "Mit "+
+ "<span title=\"base/l:&quot;Mann&quot;\">"+
+ "&quot;Mann&quot;"+
+ "</span>"+
+ " "+
+ "<span title=\"base/l:&amp;\">&amp;</span>"+
+ " "+
+ "{Ma\\us}"+
+ "</span>"+
+ "</mark>"+
+ "<span class=\"context-right\"></span>",
+ km.getSnippetHTML());
+ };
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index ae08cc4..03bafcc 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -90,6 +90,16 @@
assertEquals(null, id.getDocID());
assertEquals("GOE_AGF.02286", id.getTextSigle());
assertEquals("match-GOE_AGF.02286-p2105-2106", id.toString());
+
+ id = new MatchIdentifier("match-corpus-1/doc-1/text-1/p2105-2106");
+ assertEquals("match-corpus-1/doc-1/text-1-p2105-2106", id.toString());
+ assertEquals("corpus-1/doc-1/text-1", id.getTextSigle());
+ /*
+ assertEquals(2105, id.getStartPos());
+ assertEquals(2106, id.getEndPos());
+ assertEquals(null, id.getCorpusID());
+ assertEquals(null, id.getDocID());
+ */
};