Improved fuzzing for annotated documents and documents with sentences

Change-Id: I8b7c780c313937d1a168868b6e72f8d934460351
diff --git a/Changes b/Changes
index 1e9ff8b..6dcb6f7 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,7 @@
-0.60.4 2022-03-30
+0.60.4 2022-08-10
     - [security] Update dependencies (diewald)
+    - [feature] Improved fuzzing for annotated documents
+      including sentences (diewald)
 
 0.60.3 2022-03-30
     - [cleanup] Updated fingerprints to base64url
diff --git a/src/test/java/de/ids_mannheim/korap/TestSimple.java b/src/test/java/de/ids_mannheim/korap/TestSimple.java
index 0243250..f6e25a8 100644
--- a/src/test/java/de/ids_mannheim/korap/TestSimple.java
+++ b/src/test/java/de/ids_mannheim/korap/TestSimple.java
@@ -9,6 +9,7 @@
 import java.io.InputStreamReader;
 import java.net.URLDecoder;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -142,6 +143,63 @@
         return fd;
     };
 
+
+    // Create a new FieldDocument with random data: l random tokens from chars
+    // (l drawn from [minLength, maxLength)), up to two random "a:" annotations
+    // per token, one text span and randomly placed sentence spans.
+    public static FieldDocument annotatedFuzzyWithSentencesFieldDoc (List<String> chars, int minLength, int maxLength) {
+        FieldDocument fd = new FieldDocument();
+        String annotation = "";
+        String surface = "";
+        int l = (int)(Math.random() * (maxLength - minLength)) + minLength;
+
+        // sentences[i] marks token i as a sentence start. Only the first
+        // token always starts one; the others do so with ~30% probability.
+        boolean sentences[] = new boolean[l];
+        Arrays.fill(sentences, false);
+        sentences[0] = true;
+        for (int i = 1; i < l; i++) {
+            if (Math.random() > 0.7) {
+                sentences[i] = true;
+            };
+        };
+
+        for (int i = 0; i < l; i++) {
+            String fixChar = chars.get((int)(Math.random() * chars.size()));
+            surface += fixChar;
+            annotation += "[("+i+"-"+(i+1)+")s:"+fixChar;
+            // The span over the whole text is attached to the first token
+            if (i == 0)
+                annotation += "|<>:base/s:t$<b>64<i>0<i>" + l + "<i>" + l + "<b>0";
+
+            // The random upper bound is re-rolled on every loop check
+            for (int j = 0; j < (int)(Math.random() * 3); j++) {
+                fixChar = chars.get((int)(Math.random() * chars.size()));
+                annotation += "|a:" + fixChar;
+            };
+            if (sentences[i]) {
+                // Derive the end value from the next sentence start, if any
+                int sl = 0;
+                if (i != l - 1) {
+                    for (int x = i+1; x < l; x++) {
+                        if (sentences[x]) {
+                            sl = x - 1;
+                            break;
+                        };
+                    }
+                };
+                // Still 0 (nothing found, or x - 1 was 0): use the length
+                if (sl == 0)
+                    sl = l;
+                annotation += "|<>:base/s:s$<b>64<i>" + sl + "<i>" + sl + "<b>1";
+            };
+            annotation += "|_"+i+"$<i>"+i+"<i>"+(i+1)+"]";
+        };
+        fd.addTV("base",surface, annotation);
+        fd.addString("copy", annotation);
+        return fd;
+    };
+
+    
     // Get Term Vector
     public static MultiTermTokenStream getTermVector (String stream) {
         MultiTermTokenStream ts = new MultiTermTokenStream();
@@ -244,7 +302,7 @@
 
     // Simple fuzzing test
     public static void fuzzingTest (List<String> chars, Pattern resultPattern,
-            SpanQuery sq, int minTextLength, int maxTextLength, int maxDocs)
+                                    SpanQuery sq, int minTextLength, int maxTextLength, int maxDocs, int docType)
             throws IOException, QueryException {
 
         Krill ks = new Krill(sq);
@@ -258,8 +316,20 @@
 
             // Create a corpus of <= maxDocs fuzzy docs
             for (int i = 0; i < (int) (Math.random() * maxDocs); i++) {
-                FieldDocument testDoc = simpleFuzzyFieldDoc(chars,
+                // docType: 1 = annotated, 2 = annotated with sentences,
+                // anything else = simple
+                FieldDocument testDoc;
+                if (docType == 1) {
+                    testDoc = annotatedFuzzyFieldDoc(
+                        chars,
                         minTextLength, maxTextLength);
+                } else if (docType == 2) {
+                    testDoc = annotatedFuzzyWithSentencesFieldDoc(chars,
+                        minTextLength, maxTextLength);
+                } else {
+                    testDoc = simpleFuzzyFieldDoc(chars,
+                        minTextLength, maxTextLength);
+                };
                 String testString = testDoc.doc.getField("base").stringValue();
                 Matcher m = resultPattern.matcher(testString);
                 list.add(testString);
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
index ddd539c..71a3306 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
@@ -45,7 +45,7 @@
 
         Pattern resultPattern = Pattern.compile("cca");
         TestSimple.fuzzingTest(chars, resultPattern, snq2,
-                5, 10, 8);
+                               5, 10, 8, 0);
     }
     
     @Test
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
index 8103dde..9d5806c 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
@@ -62,7 +62,7 @@
 
         Pattern resultPattern = Pattern.compile("c[a-e]{0,2}a");
         TestSimple.fuzzingTest(chars, resultPattern, snq,
-                6, 20, 8);
+                               6, 20, 8, 0);
     }
     
     @Test