Begin the switch from text.xml to ann_segmentation.xml. For now, data.xml is created properly (whitespace and tokenization alternatives). A lot of code cleanup is still pending.

Change-Id: Ib8ea509971adff46946fc803e053f6389ec49f2d
diff --git a/nkjp2korap.xsl b/nkjp2korap.xsl
index d8babb5..77e070a 100644
--- a/nkjp2korap.xsl
+++ b/nkjp2korap.xsl
@@ -24,7 +24,7 @@
           (this is just for testing)  -->
   
 
-<!--          VARIABLES             -->
+<!--          VARIABLES (= constants...)            -->
 
   <xsl:variable name="corpusID" as="xs:string" select="'NKJP'" static="yes"/>
   <xsl:variable name="docID" as="xs:string" select="'NKJP'" static="yes"/>
@@ -74,6 +74,27 @@
     <xsl:sequence select="$rel_depth"/>
   </xsl:function>
 
+<xsl:function name="f:is_preceded_by_ws" as="xs:boolean">
+  <!-- True when $node (a seg, s or p element) counts as preceded by whitespace: a seg unless it carries @nkjp:nps; an s when an earlier sibling s or an earlier p exists; a p when an earlier sibling p exists. Any other element terminates the transformation. The debug messages use string-join() because '||' raises a type error when an operand holds more than one node. -->
+  <xsl:param name="node" as="element()"/>
+  <xsl:choose>
+    <xsl:when test="local-name($node) eq 'seg'">
+      <xsl:sequence select="not(exists($node/@nkjp:nps))"/>
+    </xsl:when>
+    <xsl:when test="local-name($node) eq 's'">
+      <xsl:message select="'s - prec s:  ' || string-join($node/preceding-sibling::tei:s, ' ')"/>
+      <xsl:message select="'same s - prec p:  ' || string-join($node/ancestor::tei:p[1]/preceding-sibling::tei:p, ' ') ||  '&#10;'"/>
+      <xsl:sequence select="exists($node/preceding-sibling::tei:s) or exists($node/ancestor::tei:p[1]/preceding-sibling::tei:p)"/>
+    </xsl:when>
+    <xsl:when test="local-name($node) eq 'p'">
+      <xsl:message select="'p : ' || string-join($node/preceding-sibling::tei:p, ' ')"/>
+      <xsl:sequence select="exists($node/preceding-sibling::tei:p)"/>
+    </xsl:when>
+    <xsl:otherwise>
+      <xsl:message terminate="yes" select="'Wrong argument passed to f:is_preceded_by_ws(): ' || local-name($node) || ' Only p, s, seg are allowed.'"/>
+    </xsl:otherwise>
+  </xsl:choose>
+</xsl:function>
 
   <xsl:function name="f:calc_offsets" as="xs:integer+">
     <xsl:param name="node" as="element()"/>
@@ -156,13 +177,14 @@
                 <xsl:sequence select="0"/>
               </xsl:when>
               <xsl:otherwise>
-                <xsl:sequence select="sum(f:calc_offsets($preceding[last()],true())[$OFFSET_END],1)"/>
+                <xsl:sequence select="f:calc_offsets($preceding[last()],true())[$OFFSET_END]"/>
+                <!--<xsl:sequence select="sum(f:calc_offsets($preceding[last()],true())[$OFFSET_END],1)"/>-->
                                                     <!--   again, CAREFUL ABOUT THE +1, it might need to vanish         -->
               </xsl:otherwise>
             </xsl:choose>
           </xsl:variable>
           
-          <xsl:variable name="external_start" as="xs:integer" select="f:calc_offsets($node/ancestor::tei:p[1],true())"/>
+          <xsl:variable name="external_start" as="xs:integer" select="f:calc_offsets($node/ancestor::tei:p[1],false())[$OFFSET_START]"/>
           
           <xsl:sequence select="$internal_start + $external_start"/>
         </xsl:when>
@@ -178,7 +200,7 @@
           <xsl:variable name="internal_start" select="xs:integer(substring-before($numbers, ','))"
             as="xs:integer"/>
           <xsl:variable name="external_start" as="xs:integer"
-            select="f:calc_offsets($node/ancestor::tei:p[1], true())"/>
+            select="f:calc_offsets($node/ancestor::tei:p[1], false())[$OFFSET_START]"/>
 
           <xsl:if test="$node/self::tei:seg and count($node/@nkjp:rejected)">
 
@@ -242,7 +264,7 @@
           select="substring-after(substring-before($node/@corresp, ')'), ',')"/>
         <xsl:if test="$node/self::tei:seg and count($node/@nkjp:rejected)">
           <!--     REMOVE THIS     -->
-          <xsl:message select="$numbers"/>
+          <xsl:message select="'rejected: ' || $numbers"/>
         </xsl:if>
         <xsl:sequence
           select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"
@@ -251,9 +273,9 @@
     </xsl:choose>
     </xsl:variable>
 
-    <xsl:message select="'length: ' || $length"/>
+    <xsl:message select="local-name($node) || '[' || count($node/preceding-sibling::*[local-name() eq local-name($node)])+1 || '] length: ' || $length || '    skip_start: ' || $skip_start"/>
     
-    <xsl:sequence select="$start, $start + $length -1"/>
+    <xsl:sequence select="$start, $start + $length -1 + xs:integer($skip_start)"/>
   </xsl:function>
 
   <xsl:function name="f:calc_content_length" as="xs:integer">
@@ -297,7 +319,7 @@
   <xsl:template match="@default" mode="#all"/>
   <!--  this is to delete some auto-inserted attribute throughout -->
 
-  <xsl:template match="tei:w" mode="#all"/>
+  <!--<xsl:template match="tei:w" mode="#all"/> w is better than ab, now ... -->
 <!-- NKJP-SGJP has apparently resigned from standoff representations by adding <w> everywhere;
      for the time being, we'll just stick to the standoff offsets, although that may need to 
      be revisited as the NKJP format has now began to stray from its schemas and assumptions -->
@@ -332,18 +354,23 @@
       
       <xsl:choose>
         <xsl:when test="$my_textID = $IDs_to_skip"/>
+        <!--  this is a utility step, for when we want to ignore some texts for any reason (debugging, selective update)     -->
+        
         <xsl:otherwise>
 
-          <xsl:message select="f:calc_offsets(doc($ann_segmentation.uri)//tei:body/tei:p[4],false())"/>
+          <!--<xsl:message select="f:calc_offsets(doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[2]/tei:seg[1],false())"/>-->
           
-          <!--<xsl:call-template name="process_single_sample">
+<!--          <xsl:message select="doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[1]/tei:seg[1] || f:is_preceded_by_ws(doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[1]/tei:seg[1])"/>
+          <xsl:message select="doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[1]/tei:seg[3] || f:is_preceded_by_ws(doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[1]/tei:seg[3])"/>
+-->          
+          <xsl:call-template name="process_single_sample">
             <xsl:with-param name="text.xml" as="document-node()" select="."/>
             <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
               select="doc($ann_morphosyntax.uri)"/>
             <xsl:with-param name="ann_segmentation.xml" as="document-node()"
               select="doc($ann_segmentation.uri)"/>
             <xsl:with-param name="my_textID" select="$my_textID" as="xs:string"/>
-          </xsl:call-template>-->
+          </xsl:call-template>
         </xsl:otherwise>
       </xsl:choose>
     </xsl:for-each>
@@ -354,6 +381,7 @@
     <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
     <xsl:param name="ann_segmentation.xml" as="document-node()"/>
     <xsl:param name="my_textID" as="xs:string" select="'0-BAD_textID'"/>
+    <!-- empty textID should never happen, but if it does, it will be signalled at the top of the output   -->
     
     <xsl:variable name="targetBaseDir" as="xs:string" select="$targetCorpusDir_slashed || $docID || '/' || $my_textID"/>
     
@@ -363,12 +391,13 @@
     
     
     <xsl:call-template name="create_data">
-      <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
+      <!--<xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>-->
+      <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml" as="document-node()"/>
       <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
       <xsl:with-param name="target" select="$targetBaseDir ||  '/data.xml'" as="xs:string"/>
     </xsl:call-template>
 
-    <xsl:call-template name="create_struct">
+<!--    <xsl:call-template name="create_struct">
       <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
       <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
       <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml"
@@ -376,8 +405,8 @@
       <xsl:with-param name="target" select="$targetBaseDir ||  '/struct/structure.xml'" as="xs:string"
       />
     </xsl:call-template>
-
-    <xsl:call-template name="create_morpho">
+-->
+<!--    <xsl:call-template name="create_morpho">
       <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
       <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
       <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml"
@@ -386,7 +415,7 @@
         as="document-node()"/>
       <xsl:with-param name="target" select="$targetBaseDir ||  '/nkjp/morpho.xml'" as="xs:string"/>
     </xsl:call-template>
-
+-->
     <xsl:call-template name="create_text_header">
       <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
       <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
@@ -398,7 +427,8 @@
   <!--   **************************        data.xml       *******************                -->
 
   <xsl:template name="create_data">
-    <xsl:param name="text.xml" as="document-node()"/>
+    <!--<xsl:param name="text.xml" as="document-node()"/>-->
+    <xsl:param name="ann_segmentation.xml" as="document-node()"/>
     <xsl:param name="compoundID" as="xs:string"/>
     <xsl:param name="target" as="xs:string"/>
     <!-- create the data.xml file -->
@@ -413,8 +443,20 @@
         </xsl:element>
 
         <xsl:element name="text" namespace="{$KorAP_namespace}">
-          <!--<xsl:value-of select="$text.xml//*[local-name() = 'ab']"/>-->
-          <xsl:apply-templates select="$text.xml//*[local-name() = 'ab']"/>
+          <!--<xsl:value-of select="$text.xml//*[local-name() = 'ab']"/>    preserves more whitespace -->
+          <!--<xsl:apply-templates select="$text.xml//*[local-name() = 'ab']"/>-->
+          <xsl:variable name="content" as="xs:string+">
+            <xsl:variable name="segs" select="$ann_segmentation.xml/tei:teiCorpus/tei:TEI/tei:text/tei:body/tei:p/tei:s//tei:seg[count(@nkjp:rejected) eq 0]" as="element()+"/>
+            <xsl:sequence select="head($segs)/tei:w"/>
+            <xsl:for-each select="tail($segs)">
+              <xsl:sequence select="
+                  if (f:is_preceded_by_ws(.)) then
+                    ' '
+                  else
+                    '', ./tei:w"/>
+            </xsl:for-each>
+          </xsl:variable>
+          <xsl:value-of select="string-join($content)"/>
         </xsl:element>
       </xsl:element>
     </xsl:result-document>
@@ -446,12 +488,12 @@
     <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
     <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
     <xsl:param name="index" as="xs:integer" required="no" select="1"/>
-    <!-- I have made a major mess here, but it works... it's so spread out 
-      because I wanted to make sure to be able to look up the individual 
+    <!-- It's so spread out because I want to make sure to be able to look up the individual 
       constituent values, should anything go wrong; optimization will come when it's worked against a larger dataset   -->
     <xsl:variable name="my_name" select="local-name()" as="xs:string"/>
     <xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
     <xsl:variable name="preceding-count" select="count($preceding)"/>
+
     <xsl:variable name="outside-preceding-count" as="xs:integer">
       <xsl:choose>
         <xsl:when test="self::tei:s or self::tei:p">
@@ -472,6 +514,7 @@
         </xsl:otherwise>
       </xsl:choose>
     </xsl:variable>
+
     <xsl:variable name="my_index" select="$index + 1 + $preceding-count + $outside-preceding-count"
       as="xs:integer"/>