Correct Snippet interpretation (but keep the old mark-split algorithm as a fallback and for testing)

Change-Id: I9c93b3b881a9e548b064145787d83de98a2a5576
diff --git a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/IdsExportService.java b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/IdsExportService.java
index b81bc51..e8b2828 100644
--- a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/IdsExportService.java
+++ b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/IdsExportService.java
@@ -57,6 +57,7 @@
  * - Add progress mechanism.
  * - Add CSV export format.
  * - Add table layout to RTF information.
+ * - Add loading marker.
  */
 
 @Path("/")
diff --git a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/RtfExporter.java b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/RtfExporter.java
index 0862304..e5122b3 100644
--- a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/RtfExporter.java
+++ b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/RtfExporter.java
@@ -61,7 +61,7 @@
             .append("{\\colortbl;\\red0\\green0\\blue0;\\red127\\green127\\blue127;}\n")
             .append("{\\fonttbl{\\f0\\fcharset0 Times New Roman;}{\\f1\\fcharset1 Courier;}}\n");
 
-        w.append("{\\footer\\pard\\ql\\fs18\\f0 ");
+        w.append("{\\footer\\pard\\qr\\fs18\\f0 ");
         rtfText(w, "@ Institut für Deutsche Sprache, Mannheim");
 
         // Page number
@@ -131,9 +131,9 @@
                 w.append("[...] ");
             };
             rtfText(w, s.getLeft());
-            w.append(" {\\b ");
+            w.append("{\\b ");
             rtfText(w, s.getMark());
-            w.append("} ");
+            w.append("}");
             rtfText(w, s.getRight());
             if (s.hasMoreRight()) {
                 w.append(" [...]");
diff --git a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/Snippet.java b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/Snippet.java
index 678a562..06b8ca0 100644
--- a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/Snippet.java
+++ b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/Snippet.java
@@ -1,36 +1,69 @@
 package de.ids_mannheim.korap.plkexport;
 
 import java.util.regex.Pattern;
+import java.util.regex.Matcher;
 
 public class Snippet {
 
     private String left, right, mark;
     private boolean leftMore, rightMore;
 
-    private static Pattern leftMoreP =
-        Pattern.compile("(?i)<span[^>]*?class=\"more\".+<mark>");
-    private static Pattern rightMoreP =
-        Pattern.compile("(?i)</mark>.+<span[^>]*?class=\"more\"");
+    private static Pattern snippetP =
+        Pattern.compile("^(?i)<span[^>]+class=\"(?:[^\"]* )?context-left(?:[^\"]* )?\">(.*?)</span>" +
+                        "<span[^>]+class=\"(?:[^\"]* )?match(?:[^\"]* )?\">(.+?)</span>" +
+                        "<span[^>]+class=\"(?:[^\"]* )?context-right(?:[^\"]* )?\">(.*?)</span>$");   
+
+    private static Pattern moreP =
+        Pattern.compile("(?i)<span[^>]+class=\"more\"></span>");
 
     public Snippet (String snippetstr) {
 
-        // Check the context
-        this.leftMore = this.rightMore = false;
-        if (leftMoreP.matcher(snippetstr).find()) {
-            this.leftMore = true;
-        };
-        if (rightMoreP.matcher(snippetstr).find()) {
-            this.rightMore = true;
-        };
+        // Match with precise algorithm
+        String left, right;
+        Matcher m = snippetP.matcher(snippetstr);
+        if (m.find()) {
+            left = m.group(1);
+            mark = m.group(2);
+            right = m.group(3);
 
-        // Split the match
-        String[] split = snippetstr
-            .replaceAll("(?i)</?span[^>]*>", "")
-            .split("</?mark>");
+            if (left != null) {
+                m = moreP.matcher(left);
+                if (m.find()) {
+                    left = m.replaceAll("");
+                    this.leftMore = true;
+                };
+                this.setLeft(unescapeHTML(left));
+            };
 
-        this.setLeft(unescapeHTML(split[0].trim()));
-        this.setMark(unescapeHTML(split[1].trim()));
-        this.setRight(unescapeHTML(split[2].trim()));
+            this.setMark(unescapeHTML(mark.replaceAll("</?mark[^>]*>", "")));
+
+            if (right != null) {
+                m = moreP.matcher(right);
+                if (m.find()) {
+                    right = m.replaceAll("");
+                    this.rightMore = true;
+                };
+                this.setRight(unescapeHTML(right));
+            };
+        }
+
+        // Simpler mark-split algorithm
+        else {
+            String[] splitted = snippetstr
+                .replaceAll("(?i)</?span[^>]*>","")
+                .split("(?i)</?mark[^>]*>");
+            if (splitted[0] != null) {
+                this.setLeft(splitted[0]);
+            };
+            if (splitted[1] != null) {
+                this.setMark(splitted[1]);
+            };
+            if (splitted[2] != null) {
+                this.setRight(splitted[2]);
+            };
+            
+            return;
+        };
     }
 
     public String getLeft () {
diff --git a/plugin/src/test/java/de/ids_mannheim/korap/plkexport/RtfExportTest.java b/plugin/src/test/java/de/ids_mannheim/korap/plkexport/RtfExportTest.java
index c191f18..e979cc1 100644
--- a/plugin/src/test/java/de/ids_mannheim/korap/plkexport/RtfExportTest.java
+++ b/plugin/src/test/java/de/ids_mannheim/korap/plkexport/RtfExportTest.java
@@ -23,7 +23,7 @@
         Response resp = rtf.serve().build();
         String x = (String) resp.getEntity();
         resp.close();
-        assertTrue(x.contains("\\footer\\pard\\ql\\fs18\\f0 @ Institut"));
+        assertTrue(x.contains("\\footer\\pard\\qr\\fs18\\f0 @ Institut"));
         assertTrue(x.contains("Institut f\\u252\\'fcr Deutsche"));
     };
 
@@ -31,21 +31,22 @@
     public void testInitFull () throws IOException {
         RtfExporter rtf = new RtfExporter();
         rtf.init("{\"meta\":\"ja\",\"collection\":\"hm\",\"query\":\"cool\"," +
-                  "\"matches\":["+
-                  "{\"author\":\"Goethe\","+
-                  "\"title\":\"Title1\","+
-                  "\"pubDate\":\"20051103\","+
-                  "\"textSigle\":\"RTF/G59/34284\","+
-                  "\"snippet\":\"Simple <mark>match1</mark> Snippet\"}"+
-                  ","+
-                  "{\"author\":\"Schiller\","+
-                  "\"title\":\"Title2\","+
-                  "\"pubDate\":\"20051104\","+
-                  "\"textSigle\":\"RTF/G59/34285\","+
-                  "\"snippet\":\"<span class=\\\"context-left\\\"><span class=\\\"more\\\"></span>"+
-                  "Simpler <mark>&quot;match2&quot;</mark> Snippet"+
-                  "<span class=\\\"more\\\"></span></span>\"}"+
-                  "]}");
+                 "\"matches\":["+
+                 "{\"author\":\"Goethe\","+
+                 "\"title\":\"Title1\","+
+                 "\"pubDate\":\"20051103\","+
+                 "\"textSigle\":\"RTF/G59/34284\","+
+                 "\"snippet\":\"Simple <mark>match1</mark> Snippet\"}"+
+                 ","+
+                 "{\"author\":\"Schiller\","+
+                 "\"title\":\"Title2\","+
+                 "\"pubDate\":\"20051104\","+
+                 "\"textSigle\":\"RTF/G59/34285\","+
+                 "\"snippet\":\"<span class=\\\"context-left\\\"><span class=\\\"more\\\"></span>"+
+                 "Simpler </span><span class=\\\"match\\\"><mark>&quot;match2&quot;</mark></span>"+
+                 "<span class=\\\"context-right\\\"> Snippet"+
+                 "<span class=\\\"more\\\"></span></span>\"}"+
+                 "]}");
 
         Response resp = rtf.serve().build();
         String x = (String) resp.getEntity();
diff --git a/plugin/src/test/java/de/ids_mannheim/korap/plkexport/SnippetTest.java b/plugin/src/test/java/de/ids_mannheim/korap/plkexport/SnippetTest.java
index 18ded61..746d3dc 100644
--- a/plugin/src/test/java/de/ids_mannheim/korap/plkexport/SnippetTest.java
+++ b/plugin/src/test/java/de/ids_mannheim/korap/plkexport/SnippetTest.java
@@ -16,8 +16,8 @@
     @Test
     public void testSimple () {
         Snippet s = new Snippet("<span class=\"context-left\">Der </span><span class=\"match\"><mark>Plagegeist</mark></span><span class=\"context-right\"> ging um</span>");
-        assertEquals(s.getLeft(),"Der");
-        assertEquals(s.getRight(),"ging um");
+        assertEquals(s.getLeft(),"Der ");
+        assertEquals(s.getRight()," ging um");
         assertEquals(s.getMark(),"Plagegeist");
         assertFalse(s.hasMoreLeft());
         assertFalse(s.hasMoreRight());
@@ -39,4 +39,14 @@
         assertTrue(s.hasMoreLeft());
         assertTrue(s.hasMoreRight());
     };
+
+    @Test
+    public void testMultipleMarks () {
+        Snippet s = new Snippet("<span class=\"context-left\"><span class=\"more\"></span>Figueras (gegen 1030, Kopialbuch der Abtei von Saint-Pé-de-Bigorre),  Figeres (1154, laut Pierre de Marcas Buch Histoire de Béarn),  Figueres (1421, Urkunden der Vicomté von Béarn),  Higueres (1750, Karte von Cassini),  Higueres (1793, Notice Communale) und   Higueres und Higuères (1801, Bulletin </span><span class=\"match\"><mark><mark class=\"class-2 level-0\">des <mark class=\"class-1 level-1\">lois</mark></mark><mark class=\"class-1 level-1\">). Toponyme</mark></mark></span><span class=\"context-right\"> und Erwähnungen von Souye waren:  Soyge und Soya (1538 bzw. 1547, Manuskriptsammlung des 16. bis 18. Jahrhunderts),  Souia (1645, Volkszählung von Morlaàs),  Souge und Souie (1675 bzw. 1682, Manuskriptsammlung des 16. bis 18. Jahrhunderts),  Souge (1750, Karte von Cassini),  Souye<span class=\"more\"></span></span>");
+        assertEquals(s.getLeft(), "Figueras (gegen 1030, Kopialbuch der Abtei von Saint-Pé-de-Bigorre),  Figeres (1154, laut Pierre de Marcas Buch Histoire de Béarn),  Figueres (1421, Urkunden der Vicomté von Béarn),  Higueres (1750, Karte von Cassini),  Higueres (1793, Notice Communale) und   Higueres und Higuères (1801, Bulletin ");
+        assertEquals(s.getRight()," und Erwähnungen von Souye waren:  Soyge und Soya (1538 bzw. 1547, Manuskriptsammlung des 16. bis 18. Jahrhunderts),  Souia (1645, Volkszählung von Morlaàs),  Souge und Souie (1675 bzw. 1682, Manuskriptsammlung des 16. bis 18. Jahrhunderts),  Souge (1750, Karte von Cassini),  Souye");
+        assertEquals(s.getMark(),"des lois). Toponyme");
+        assertTrue(s.hasMoreLeft());
+        assertTrue(s.hasMoreRight());
+    };
 };