before migration from calc_content_length to calc_offsets
Change-Id: I43d17d8350a59f689674499ecf9e6bc171fd9ac9
diff --git a/nkjp2korap.xsl b/nkjp2korap.xsl
index b4909ba..d8babb5 100644
--- a/nkjp2korap.xsl
+++ b/nkjp2korap.xsl
@@ -9,16 +9,19 @@
 <!--           PARAMETERS           -->
 
   <xsl:param name="sourceDir" select="'test/resources/nkjp2korap_sample2'" as="xs:string"/>
-  <!-- the directory containing NKJP files, in the form of a collection of text-level dirs -->
+  <!-- the directory containing NKJP files, in the form of a collection of text-level dirs
+       (that is how we know both the $corpusID and the $docID) -->
 
   <xsl:param name="targetDir" select="'test/output'" as="xs:string"/>
+  <!--  where the corpus/document/text/annotations hierarchy is going to be created -->
   
   <xsl:param name="skip_docID" as="xs:string">
     <xsl:value-of select="'HellerPodgladanie,IsakowiczZaleskiMoje,KolakowskiOco,MysliwskiKamien,WilkWilczy,ZycieWarszawy_Zycie'"/>
   </xsl:param>
   <!--  comma-separated list of document IDs to be skipped from processing
            example: HellerPodgladanie,KOT
-          no functionality beyond string identity is supported  -->
+          no functionality beyond string identity is supported
+          (this is just for testing)  -->
   
 
 <!--          VARIABLES             -->
@@ -28,15 +31,14 @@
 
   <xsl:variable name="targetCorpusDir_slashed" select="$targetDir || '/' || $corpusID || '/'" as="xs:string"/>
 
-  <xsl:variable name="systemDoctypeI5"
-    select="'http://corpora.ids-mannheim.de/I5/DTD/i5.dtd'" as="xs:string"
-    static="true"/>
+  <xsl:variable name="systemDoctypeI5" as="xs:string"
+    select="'http://corpora.ids-mannheim.de/I5/DTD/i5.dtd'" static="true"/>
 
-  <xsl:variable name="publicDoctypeI5" select="'-//IDS//DTD I5 1.0//EN'" as="xs:string"
-    static="true"/>
+  <xsl:variable name="publicDoctypeI5" as="xs:string" static="true"
+    select="'-//IDS//DTD I5 1.0//EN'"/>
 
-  <xsl:variable name="KorAP_namespace" select="'http://ids-mannheim.de/ns/KorAP'" static="true"
-    as="xs:string"/>
+  <xsl:variable name="KorAP_namespace" static="true" as="xs:string"
+    select="'http://ids-mannheim.de/ns/KorAP'"/>
 
   <xsl:variable name="KorAP-XML_version" select="'KorAP-0.4'" as="xs:string" static="true"/>
   <!--  this is only a bit funny -->
@@ -47,6 +49,12 @@
   <!-- see https://www.saxonica.com/documentation11/index.html#!sourcedocs/collections/collection-directories -->
 
   <xsl:variable name="collection_of_text" select="collection($sourceDir || '?' || $collection_params)" as="document-node()+"/>
+  
+<!-- these two constants are meant to increase the readability of the code:
+     they are the positions of the two members of the sequence (start, end)
+     returned by the f:calc_offsets() function, e.g. f:calc_offsets(...)[$OFFSET_END] -->
+  <xsl:variable name="OFFSET_START" as="xs:integer" static="true" select="1"/>
+  <xsl:variable name="OFFSET_END" as="xs:integer" static="true" select="2"/>
 
 
 <!--           MODES               -->
@@ -59,39 +67,225 @@
   <!--           FUNCTIONS             -->
 
   <xsl:function name="f:compute_nesting" as="xs:integer">
-    <xsl:param name="node" as="node()"/>
+    <xsl:param name="node" as="element()"/>
     <xsl:variable name="rel_depth"
       select="count($node/ancestor-or-self::*[local-name(.) ne 'TEI'][local-name(.) ne 'teiCorpus'])"
       as="xs:integer"/>
     <xsl:sequence select="$rel_depth"/>
   </xsl:function>
 
+
+  <xsl:function name="f:calc_offsets" as="xs:integer+">
+    <!-- Returns the pair (start, end) of inclusive character offsets of $node;
+         address the two members with [$OFFSET_START] and [$OFFSET_END].
+         $node:       a tei:text, tei:body, tei:p, tei:s or tei:seg element
+         $skip_start: if true, the start is reported as 0 and the end is derived from
+                      the @corresp offsets alone (the code treats these as relative to
+                      the enclosing tei:p)
+         Segments with nkjp:rejected="true" are ignored when the last segment of a
+         container is looked up. -->
+    <xsl:param name="node" as="element()"/>
+    <xsl:param name="skip_start" as="xs:boolean" />
+
+    <!-- start of the enclosing p; only needed for s and seg, whose @corresp offsets are
+         treated as p-relative (it stays 0 when the start is skipped) -->
+    <xsl:variable name="p_start" as="xs:integer"
+      select="if ($skip_start or not($node/self::tei:s or $node/self::tei:seg)) then 0
+              else f:calc_offsets($node/ancestor::tei:p[1], false())[$OFFSET_START]"/>
+
+    <xsl:variable name="start" as="xs:integer">
+      <xsl:choose>
+
+        <xsl:when test="$skip_start or $node/self::tei:text or $node/self::tei:body">
+          <xsl:sequence select="0"/>
+        </xsl:when>
+
+        <!--        handle p -->
+
+        <xsl:when test="$node/self::tei:p">
+          <xsl:variable name="my_pos" as="xs:integer" select="count($node/preceding-sibling::tei:p) + 1"/>
+          <xsl:variable name="preceding" as="node()*"
+            select="$node/ancestor::tei:body/tei:p[position() lt $my_pos]"/>
+
+          <xsl:choose>
+            <xsl:when test="count($preceding) eq 0">
+              <xsl:sequence select="0"/>
+            </xsl:when>
+            <xsl:otherwise>
+              <!-- Add up the lengths of ALL preceding p's: with skip_start = true() every call
+                   yields (0, length - 1), hence the explicit "+ 1" per paragraph.
+                   (sum(..., 1) never added 1: the second argument of sum() is merely the
+                   value returned for an empty sequence.)
+                   NOTE(review): whether an extra separator character has to be counted
+                   between paragraphs still needs to be confirmed against the text layer. -->
+              <xsl:sequence
+                select="sum($preceding ! (f:calc_offsets(., true())[$OFFSET_END] + 1))"/>
+
+<!--              <xsl:variable name="last_corresps"
+                select="$preceding/descendant::tei:s[last()]/(descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'])[last()]/@corresp"
+                as="attribute(corresp)+"/>
+              <xsl:variable name="end_offsets" as="xs:integer+">
+                <xsl:for-each select="$last_corresps">
+                  <xsl:variable name="numbers"
+                    select="substring-after(substring-before(., ')'), ',')"/>
+                  <xsl:sequence
+                    select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"
+                  />
+                </xsl:for-each>
+              </xsl:variable>
+              <xsl:sequence select="sum($end_offsets, 1)"/>
+              
+              this variant reads the @corresp values directly instead of calling f:calc_offsets; it may turn out to be less cpu-intensive, not sure
+              - but if it's plugged in, it will have to be adjusted to the current form of the live variant (note that sum($end_offsets, 1) does not add 1),
+                because it hasn't been maintained since it got commented out
+              -->
+            </xsl:otherwise>
+          </xsl:choose>
+        </xsl:when>
+
+        <!--        handle s -->
+
+        <!-- the start of an s is the end of the preceding s within the same p, plus one,
+             plus the start of that p -->
+        <xsl:when test="$node/self::tei:s">
+          <xsl:variable name="internal_start" as="xs:integer">
+            <xsl:variable name="my_pos" as="xs:integer" select="count($node/preceding-sibling::tei:s) + 1"/>
+            <xsl:variable name="preceding" as="node()*"
+              select="$node/ancestor::tei:p[1]/tei:s[position() lt $my_pos]"/>
+
+            <xsl:choose>
+              <xsl:when test="count($preceding) eq 0">
+                <xsl:sequence select="0"/>
+              </xsl:when>
+              <xsl:otherwise>
+                <!-- only the directly preceding s is needed: its end offset is already p-relative -->
+                <xsl:sequence select="f:calc_offsets($preceding[last()], true())[$OFFSET_END] + 1"/>
+              </xsl:otherwise>
+            </xsl:choose>
+          </xsl:variable>
+
+          <xsl:sequence select="$internal_start + $p_start"/>
+        </xsl:when>
+
+        <!--        handle seg -->
+
+        <xsl:when test="$node/self::tei:seg">
+          <!--   for segs, the s elements are irrelevant, and the local offset is immediately available on the @corresp       -->
+
+          <xsl:variable name="numbers"
+            select="substring-after(substring-before($node/@corresp, ')'), ',')"/>
+
+          <xsl:variable name="internal_start" select="xs:integer(substring-before($numbers, ','))"
+            as="xs:integer"/>
+
+          <xsl:if test="$node/self::tei:seg and count($node/@nkjp:rejected)">
+
+            <xsl:message select="'numbers: ' || $numbers"/>
+          </xsl:if>
+          <xsl:sequence select="$internal_start + $p_start"/>
+        </xsl:when>
+      </xsl:choose>
+    </xsl:variable>
+
+    <!-- for text, body and p this is the number of characters covered; for s and seg it is
+         the (exclusive) end offset relative to the enclosing p, as read from @corresp -->
+    <xsl:variable name="length" as="xs:integer">
+    <xsl:choose>
+
+      <xsl:when test="$node/self::tei:text or $node/self::tei:body">
+        <xsl:variable name="last_corresps"
+          select="$node/descendant::tei:p/descendant::tei:s[last()]/(descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'])[last()]/@corresp"
+          as="attribute(corresp)+"/>
+
+        <xsl:variable name="end_offsets" as="xs:integer+">
+          <xsl:for-each select="$last_corresps">
+            <xsl:variable name="numbers" select="substring-after(substring-before(., ')'), ',')"/>
+            <xsl:sequence
+              select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"
+            />
+          </xsl:for-each>
+        </xsl:variable>
+
+        <xsl:sequence select="sum($end_offsets)"/>
+
+      </xsl:when>
+      <xsl:when test="$node/self::tei:p">
+        <xsl:variable name="last_corresps"
+          select="$node/descendant::tei:s[last()]/(descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'])[last()]/@corresp"
+          as="attribute(corresp)+"/>
+        <xsl:variable name="end_offsets" as="xs:integer+">
+          <xsl:for-each select="$last_corresps">
+            <xsl:variable name="numbers" select="substring-after(substring-before(., ')'), ',')"/>
+            <xsl:sequence
+              select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"
+            />
+          </xsl:for-each>
+        </xsl:variable>
+        <xsl:sequence select="sum($end_offsets)"/>
+      </xsl:when>
+
+      <xsl:when test="$node/self::tei:s">
+        <xsl:variable name="last_corresp"
+          select="$node/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/attribute::corresp"
+          as="attribute(corresp)"/>
+        <xsl:variable name="numbers"
+          select="substring-after(substring-before($last_corresp, ')'), ',')"/>
+        <xsl:sequence
+          select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"
+        />
+      </xsl:when>
+      <xsl:otherwise>
+        <xsl:variable name="numbers"
+          select="substring-after(substring-before($node/@corresp, ')'), ',')"/>
+        <xsl:if test="$node/self::tei:seg and count($node/@nkjp:rejected)">
+          <!--     REMOVE THIS     -->
+          <xsl:message select="$numbers"/>
+        </xsl:if>
+        <xsl:sequence
+          select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"
+        />
+      </xsl:otherwise>
+    </xsl:choose>
+    </xsl:variable>
+
+    <xsl:message select="'length: ' || $length"/>
+
+    <!-- for s and seg, $length is an end offset relative to the enclosing p, so it is
+         added to the start of that p rather than to the element's own start -->
+    <xsl:sequence
+      select="$start, (if ($node/self::tei:s or $node/self::tei:seg) then $p_start else $start) + $length - 1"/>
+  </xsl:function>
+
   <xsl:function name="f:calc_content_length" as="xs:integer">
     <xsl:param name="node" as="node()"/>
     <xsl:choose>
       <xsl:when test="$node/self::tei:text or $node/self::tei:body">
         <xsl:variable name="last_corresp"
-          select="$node/descendant::tei:p[last()]/descendant::tei:s[last()]/descendant::tei:seg[last()]/attribute::corresp"
+          select="$node/descendant::tei:p[last()]/descendant::tei:s[last()]/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/attribute::corresp"
           as="attribute(corresp)"/>
         <xsl:variable name="numbers" select="substring-after(substring-before($last_corresp,')'),',')"/>
         <xsl:sequence select="xs:integer(substring-before($numbers,',')) + xs:integer(substring-after($numbers,','))"/>
       </xsl:when>
       <xsl:when test="$node/self::tei:p">
         <xsl:variable name="last_corresp"
-          select="$node/descendant::tei:s[last()]/descendant::tei:seg[last()]/attribute::corresp"
+          select="$node/descendant::tei:s[last()]/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/attribute::corresp"
           as="attribute(corresp)"/>
         <xsl:variable name="numbers" select="substring-after(substring-before($last_corresp,')'),',')"/>
         <xsl:sequence select="xs:integer(substring-before($numbers,',')) + xs:integer(substring-after($numbers,','))"/>
       </xsl:when>
       <xsl:when test="$node/self::tei:s">
         <xsl:variable name="last_corresp"
-          select="$node/descendant::tei:seg[last()]/attribute::corresp"
+          select="$node/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/attribute::corresp"
           as="attribute(corresp)"/>
         <xsl:variable name="numbers" select="substring-after(substring-before($last_corresp,')'),',')"/>
         <xsl:sequence select="xs:integer(substring-before($numbers,',')) + xs:integer(substring-after($numbers,','))"/>
       </xsl:when>
       <xsl:otherwise>
         <xsl:variable name="numbers" select="substring-after(substring-before($node/@corresp,')'),',')"/>
+        <xsl:if test="$node/self::tei:seg and count($node/@nkjp:rejected)">
+                                                                                                                      <!--     REMOVE THIS     -->
+          <xsl:message select="$numbers"/>
+        </xsl:if>
         <xsl:sequence select="xs:integer(substring-before($numbers,',')) + xs:integer(substring-after($numbers,','))"/>
       </xsl:otherwise>
     </xsl:choose>
@@ -122,7 +316,7 @@
 
   <xsl:template name="xsl:initial-template">
     <xsl:variable name="IDs_to_skip" select="tokenize($skip_docID,',')" as="xs:string*"/>
-    
+
     <!-- we only want to call the template below once, and we process a random NKJP corpus file for that purpose, 
       because all we need is the main corpus header, and we can (should) get to that from any NKJP corpus document -->
     <xsl:call-template name="create_corpus_header">
@@ -139,14 +333,17 @@
       <xsl:choose>
         <xsl:when test="$my_textID = $IDs_to_skip"/>
         <xsl:otherwise>
-          <xsl:call-template name="process_single_sample">
+
+          <xsl:message select="f:calc_offsets(doc($ann_segmentation.uri)//tei:body/tei:p[4],false())"/>
+          
+          <!--<xsl:call-template name="process_single_sample">
             <xsl:with-param name="text.xml" as="document-node()" select="."/>
             <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
               select="doc($ann_morphosyntax.uri)"/>
             <xsl:with-param name="ann_segmentation.xml" as="document-node()"
               select="doc($ann_segmentation.uri)"/>
             <xsl:with-param name="my_textID" select="$my_textID" as="xs:string"/>
-          </xsl:call-template>
+          </xsl:call-template>-->
         </xsl:otherwise>
       </xsl:choose>
     </xsl:for-each>
@@ -156,7 +353,7 @@
     <xsl:param name="text.xml" as="document-node()"/>
     <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
     <xsl:param name="ann_segmentation.xml" as="document-node()"/>
-    <xsl:param name="my_textID" as="xs:string" select="'0BAD_textID'"/>
+    <xsl:param name="my_textID" as="xs:string" select="'0-BAD_textID'"/>
     
     <xsl:variable name="targetBaseDir" as="xs:string" select="$targetCorpusDir_slashed || $docID || '/' || $my_textID"/>
     
@@ -216,7 +413,8 @@
         </xsl:element>
 
         <xsl:element name="text" namespace="{$KorAP_namespace}">
-          <xsl:value-of select="$text.xml//*[local-name() = 'ab']"/>
+          <!--<xsl:value-of select="$text.xml//*[local-name() = 'ab']"/>-->
+          <xsl:apply-templates select="$text.xml//*[local-name() = 'ab']"/>
         </xsl:element>
       </xsl:element>
     </xsl:result-document>