Improved fuzzing for annotated documents and documents with sentences
Change-Id: I8b7c780c313937d1a168868b6e72f8d934460351
diff --git a/Changes b/Changes
index 1e9ff8b..6dcb6f7 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,7 @@
-0.60.4 2022-03-30
+0.60.4 2022-08-10
- [security] Update dependencies (diewald)
+ - [feature] Improved fuzzing for annotated documents
+ including sentences (diewald)
0.60.3 2022-03-30
- [cleanup] Updated fingerprints to base64url
diff --git a/src/test/java/de/ids_mannheim/korap/TestSimple.java b/src/test/java/de/ids_mannheim/korap/TestSimple.java
index 0243250..f6e25a8 100644
--- a/src/test/java/de/ids_mannheim/korap/TestSimple.java
+++ b/src/test/java/de/ids_mannheim/korap/TestSimple.java
@@ -9,6 +9,7 @@
import java.io.InputStreamReader;
import java.net.URLDecoder;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -142,6 +143,63 @@
return fd;
};
+
+ // Create a new FieldDocument with random surface data, random "a:" annotations and randomly placed sentence spans
+ public static FieldDocument annotatedFuzzyWithSentencesFieldDoc (List<String> chars, int minLength, int maxLength) {
+ FieldDocument fd = new FieldDocument();
+ String annotation = "";
+ String surface = "";
+ // Number of tokens, drawn from [minLength, maxLength) when maxLength > minLength
+ int l = (int)(Math.random() * (maxLength - minLength)) + minLength;
+ // sentences[i] == true marks token i as the first token of a sentence
+ boolean sentences[] = new boolean[l];
+ Arrays.fill(sentences, false);
+ sentences[0] = true;
+ // Every later token starts a new sentence with a probability of roughly 30%
+ for (int i = 1; i < l; i++) {
+ if (Math.random() > 0.7) {
+ sentences[i] = true;
+ };
+ };
+ // Build the surface text and the annotation stream token by token
+ for (int i = 0; i < l; i++) {
+ String fixChar = chars.get((int)(Math.random() * chars.size()));
+ surface += fixChar;
+ annotation += "[("+i+"-"+(i+1)+")s:"+fixChar;
+ if (i == 0)
+ annotation += "|<>:base/s:t$<b>64<i>0<i>" + l + "<i>" + l + "<b>0";
+ // Attach up to two random "a:" annotations (the loop bound is re-drawn on every check)
+ for (int j = 0; j < (int)(Math.random() * 3); j++) {
+ fixChar = chars.get((int)(Math.random() * chars.size()));
+ annotation += "|a:" + fixChar;
+ };
+ // NOTE(review): sl is derived from the next sentence start; 0 doubles as "not found" and then falls back to l - confirm intended span end
+ if (sentences[i]) {
+ int sl = 0;
+ if (i != l - 1) {
+ for (int x = i+1; x < l; x++) {
+ if (sentences[x]) {
+ sl = x - 1;
+ break;
+ };
+ }
+ };
+ if (sl == 0)
+ sl = l;
+
+ annotation += "|<>:base/s:s$<b>64<i>" + sl + "<i>" + sl + "<b>1";
+ };
+
+ annotation += "|_"+i+"$<i>"+i+"<i>"+(i+1)+"]";
+ };
+
+ // Add the annotated text as term vector field "base" and the raw annotation string as field "copy"
+ fd.addTV("base",surface, annotation);
+ fd.addString("copy", annotation);
+ return fd;
+ };
+
+
// Get Term Vector
public static MultiTermTokenStream getTermVector (String stream) {
MultiTermTokenStream ts = new MultiTermTokenStream();
@@ -244,7 +302,7 @@
// Simple fuzzing test
public static void fuzzingTest (List<String> chars, Pattern resultPattern,
- SpanQuery sq, int minTextLength, int maxTextLength, int maxDocs)
+ SpanQuery sq, int minTextLength, int maxTextLength, int maxDocs, int docType)
throws IOException, QueryException {
Krill ks = new Krill(sq);
@@ -258,8 +316,20 @@
// Create a corpus of <= maxDocs fuzzy docs
for (int i = 0; i < (int) (Math.random() * maxDocs); i++) {
- FieldDocument testDoc = simpleFuzzyFieldDoc(chars,
+ FieldDocument testDoc; // document generator is selected by docType
+ if (docType == 1) { // 1: annotated documents
+ testDoc = annotatedFuzzyFieldDoc(
+ chars,
minTextLength, maxTextLength);
+ } else if (docType == 2) { // 2: annotated documents with sentence spans (was an unreachable duplicate of == 1)
+ testDoc = annotatedFuzzyWithSentencesFieldDoc(
+ chars,
+ minTextLength, maxTextLength);
+ } else { // any other value: simple documents
+ testDoc = simpleFuzzyFieldDoc(
+ chars,
+ minTextLength, maxTextLength);
+ };
String testString = testDoc.doc.getField("base").stringValue();
Matcher m = resultPattern.matcher(testString);
list.add(testString);
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
index ddd539c..71a3306 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
@@ -45,7 +45,7 @@
Pattern resultPattern = Pattern.compile("cca");
TestSimple.fuzzingTest(chars, resultPattern, snq2,
- 5, 10, 8);
+ 5, 10, 8, 0);
}
@Test
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
index 8103dde..9d5806c 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
@@ -62,7 +62,7 @@
Pattern resultPattern = Pattern.compile("c[a-e]{0,2}a");
TestSimple.fuzzingTest(chars, resultPattern, snq,
- 6, 20, 8);
+ 6, 20, 8, 0);
}
@Test