WIP: build the map_s-p lookup map (positions and lengths of paragraphs, sentences and words)

Change-Id: I899cb4427ca26e327fb9f3f34fec653dd0770fd7
diff --git a/nkjp2korap.xsl b/nkjp2korap.xsl
index 03f66f3..acc425d 100644
--- a/nkjp2korap.xsl
+++ b/nkjp2korap.xsl
@@ -2,6 +2,7 @@
 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
   xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"
   xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:f="func"
+  xmlns:fn="http://www.w3.org/2005/xpath-functions"
   xmlns:map="http://www.w3.org/2005/xpath-functions/map" exclude-result-prefixes="xs f map nkjp tei"
   version="3.0" expand-text="yes">
 
@@ -260,7 +261,11 @@
     <xsl:param name="target" as="xs:string"/>
     
     
-<!-- map the entire document, so that the processing only takes place once, and for fast lookups   -->
+<!-- Map the entire document, so that the processing takes place only once and lookups are fast.
+    
+    TODO: move this variable up to process_single_sample, so that the map can be re-used.
+    
+    -->
     <xsl:variable name="map_w" as="map(xs:untypedAtomic,item()+)">
       <xsl:variable name="segs" select="$ann_segmentation.xml/tei:teiCorpus/tei:TEI/tei:text/tei:body/tei:p/tei:s//tei:seg[count(@nkjp:rejected) eq 0]" as="element()+"/>
       <xsl:map>
@@ -276,7 +281,78 @@
       </xsl:map>
     </xsl:variable>
     
-    <xsl:message select="'size: ' || map:size($map_w)"/>
+    <!-- map_s-p: lookup map for paragraphs (p), sentences (s) and words (w), derived from $map_w.
+         p-id => 'p', position of the p, p-length
+         s-id => 's', position of the s within its p, s-length, p-id
+         w-id => 'w', position of the w within its s, string-length of $map_w[5], s-id, $map_w[4]
+         Keys are declared as xs:anyAtomicType because grouping keys are xs:string (XSLT casts
+         xs:untypedAtomic grouping keys to xs:string) while the w keys come straight from $map_w. -->
+    <xsl:variable name="map_s-p" as="map(xs:anyAtomicType,item()+)">
+      <xsl:map>
+        <xsl:for-each-group select="map:keys($map_w)" group-by="map:get($map_w, .)[1]">
+          <xsl:sort select="xs:integer(substring-before(substring-after(., 'segm_'), '.'))"
+            order="ascending"/>
+          <xsl:sort select="xs:integer(substring-before(substring-after(., '.'), '-'))"
+            order="ascending"/>
+          <!-- position of this paragraph in sorted order; the sentence loop needs it for the text-initial check -->
+          <xsl:variable name="current-p-pos" select="position()" as="xs:integer"/>
+          <xsl:variable name="current-p" select="current-grouping-key()" as="xs:string"/>
+          <!-- The general algorithm is:
+            * sum the lengths of all the words
+            * add 'whitespace' for all of them (= count them and add that), and then
+            * subtract whitespace for those of them that are not actually preceded by it
+            * subtract 1 for the whitespace in front of the 1st word, which is not part of the paragraph;
+              identifying that 1st word would require an extra step, so the shortcut is position():
+              only the text-initial paragraph is assumed to start without whitespace - this
+              strongly depends on the presence of the xsl:sort instructions above -->
+          <xsl:variable name="p-length" select="
+              sum(for $id in current-group()
+              return
+                string-length(map:get($map_w, $id)[5])) + count(current-group()) - xs:integer(position() ne 1) -
+              count(filter(current-group(), function ($w-id) {
+                map:get($map_w, $w-id)[4] eq false()
+              }))"/>
+          <!-- debugging output for the paragraph level -->
+          <xsl:message select="'sum: ' || sum( for $id in current-group() return string-length(map:get($map_w, $id)[5]) )"/>
+          <xsl:message select="'cur-group count: ' || count(current-group())"/>
+          <xsl:message select="'subtract:' || count(filter(current-group(), function($w-id) { map:get($map_w,$w-id)[4] eq false() } ))"/>
+          <xsl:message select="'position: ' || position() || ', xs:integer(position() ne 1)=' || xs:integer(position() ne 1)"/>
+          <xsl:message select="'p-length: ' || $p-length"/>
+          <xsl:message select="'p: ', $current-p || ' pos:' || position(), current-group()"/>
+
+          <xsl:map-entry key="current-grouping-key()" select="'p', position(), $p-length"/>
+
+          <xsl:for-each-group select="current-group()" group-by="map:get($map_w, .)[2]">
+            <xsl:sort select="xs:integer(substring-before(substring-after(., 'segm_'), '.'))"
+              order="ascending"/>
+            <xsl:sort select="xs:integer(substring-before(substring-after(., '.'), '-'))"
+              order="ascending"/>
+            <xsl:variable name="current-s" select="current-grouping-key()" as="xs:string"/>
+            <!-- Same algorithm as for p-length. Only the first sentence of the first paragraph
+                 is text-initial; every other sentence is assumed to start with whitespace that
+                 must not be counted, so both the p position and the s position are checked. -->
+            <xsl:variable name="s-length" select="
+              sum(for $id in current-group()
+              return
+              string-length(map:get($map_w, $id)[5])) + count(current-group()) -
+              xs:integer($current-p-pos ne 1 or position() ne 1) -
+              count(filter(current-group(), function ($w-id) {
+              map:get($map_w, $w-id)[4] eq false()
+              }))"/>
+
+            <xsl:map-entry key="current-grouping-key()" select="'s', position(), $s-length, $current-p"/>
+            <xsl:message select="'s: ',  position(), current-group()"/>
+            <xsl:for-each select="current-group()">
+              <xsl:sort select="map:get($map_w, .)[3]" order="ascending"/>
+              <xsl:map-entry key="." select="'w', position(), string-length(map:get($map_w, .)[5]), $current-s, map:get($map_w, .)[4]"/>
+            </xsl:for-each>
+          </xsl:for-each-group>
+        </xsl:for-each-group>
+      </xsl:map>
+    </xsl:variable>
+    
+    <xsl:message select="'map_w size: ' || map:size($map_w)"/>
+    <xsl:message select="'map_s-p size: ' || map:size($map_s-p)"/>
     
     <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
       xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">