stylesheet redone for handling larger datasets; just struct and morpho for now, though

Change-Id: I708d2f0f2c2061428699b09c70998ada016b36d4
diff --git a/nkjp2korap.xsl b/nkjp2korap.xsl
index c70a01d..4bfdb7a 100644
--- a/nkjp2korap.xsl
+++ b/nkjp2korap.xsl
@@ -5,17 +5,18 @@
   xmlns:map="http://www.w3.org/2005/xpath-functions/map" exclude-result-prefixes="xs f map nkjp tei"
   version="3.0" expand-text="yes">
 
-  <xsl:param name="corpusID" as="xs:string"/>
-  <xsl:param name="docID" as="xs:string"/>
-  <xsl:param name="textID" as="xs:string"/>
+
+<!--           PARAMETERS           -->
 
   <xsl:param name="sourceDir" select="'test/resources/nkjp2korap_sample2'" as="xs:string"/>
   <!-- the directory containing NKJP files, in the form of a collection of text-level dirs -->
 
   <xsl:param name="targetDir" select="'test/output'" as="xs:string"/>
 
-  <xsl:variable name="targetTextDir_slashed"
-    select="$targetDir || '/' || $corpusID || '/' || $docID || '/' || $textID || '/'" as="xs:string"/>
+<!--          VARIABLES             -->
+
+  <xsl:variable name="corpusID" as="xs:string" select="'NKJP'" static="yes"/>
+  <xsl:variable name="docID" as="xs:string" select="'NKJP'" static="yes"/>
 
   <xsl:variable name="targetCorpusDir_slashed" select="$targetDir || '/' || $corpusID || '/'" as="xs:string"/>
 
@@ -32,17 +33,22 @@
   <xsl:variable name="KorAP-XML_version" select="'KorAP-0.4'" as="xs:string" static="true"/>
   <!--  this is only a bit funny -->
 
-  <xsl:variable name="compoundID" as="xs:string"
-    select="$corpusID || '_' || $docID || '.' || $textID"/>
-  <!--  this is what occurs in the text and data layers as @docid -->
+  <xsl:variable name="collection_params" as="xs:string" static="yes"
+    select="'recurse=yes;validation=strip;select=text.xml;content-type=application/xml;on-error=warning;xinclude=yes'"
+  />
+  <!-- see https://www.saxonica.com/documentation11/index.html#!sourcedocs/collections/collection-directories -->
+
+  <xsl:variable name="collection_of_text" select="collection($sourceDir || '?' || $collection_params)" as="document-node()+"/>
+  
+<!--           MODES               -->
 
   <xsl:mode name="corpus" on-no-match="deep-skip"/>
   <xsl:mode name="text" on-no-match="deep-skip"/>
   <xsl:mode name="header-text" on-no-match="text-only-copy"/>
 
-  <!--  <xsl:variable name="text_depth" as="xs:integer" select="xs:integer('2')" static="true"/>
-  <!-\-  this magic number indicates the depth of the <TEI> element inside teiCorpus/TEI -\->
--->
+
+  <!--           FUNCTIONS             -->
+
   <xsl:function name="f:compute_nesting" as="xs:integer">
     <xsl:param name="node" as="node()"/>
     <xsl:variable name="rel_depth"
@@ -86,57 +92,84 @@
   <xsl:template match="@default" mode="#all"/>
   <!--  this is to delete some auto-inserted attribute throughout -->
 
-  <xsl:variable name="collection_params" as="xs:string" static="yes"
-    select="'recurse=yes;validation=strip;select=text.xml;content-type=application/xml;on-error=warning;xinclude=yes'"
-  />
+  <xsl:template match="tei:w" mode="#all"/>
+<!-- NKJP-SGJP has apparently abandoned standoff representations by adding <w> everywhere;
+     for the time being, we'll just stick to the standoff offsets, although that may need to
+     be revisited as the NKJP format has now begun to stray from its schemas and assumptions -->
 
-<xsl:variable name="collection_of_text" select="collection($sourceDir || '?' || $collection_params)" as="document-node()+"/>
+  <xsl:template match="tei:choice" mode="#all"/>
+<!--  THIS IS ONLY TEMPORARY, 
+    because an interesting challenge came up where I will 
+    probably have to abandon straightforward mapping because of TOKENIZATION alternatives;
+  
+  for now, I just want this stylesheet to work, even if it occasionally eats a token (which it currently does: 'komuś' and 'czym' vanish)
+  -->
 
   <xsl:template name="xsl:initial-template">
-    <xsl:variable name="text.xml" select="$collection_of_text[1]"/>
-
-    <!-- we only want to call this once, and we process a random NKJP corpus file for that purpose, 
+    <!-- we only want to call the template below once, and we process an arbitrary NKJP corpus file (the first in the collection) for that purpose, 
       because all we need is the main corpus header, and we can (should) get to that from any NKJP corpus document -->
     <xsl:call-template name="create_corpus_header">
-      <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
+      <xsl:with-param name="text.xml" select="$collection_of_text[1]" as="document-node()"/>
       <xsl:with-param name="target" select="$targetCorpusDir_slashed || 'header.xml'" as="xs:string"/>
     </xsl:call-template>
 
-
+    <xsl:for-each select="$collection_of_text"> <!-- one pass per text.xml document in the collection -->
+      <xsl:variable name="my_dir" as="xs:string" select="replace(base-uri(),'/text\.xml$','')"/> <!-- '$' anchors the match so only the trailing file name is stripped -->
+      <xsl:variable name="my_textID" as="xs:string" select="tokenize($my_dir,'/')[last()]"/> <!-- the text ID is the name of the directory holding text.xml -->
+      <xsl:variable name="ann_morphosyntax.uri" select="$my_dir || '/ann_morphosyntax.xml'" as="xs:string"/>
+      <xsl:variable name="ann_segmentation.uri" select="$my_dir || '/ann_segmentation.xml'" as="xs:string"/>
+      
+      <xsl:call-template name="process_single_sample">
+        <xsl:with-param name="text.xml" as="document-node()" select="."/>
+        <xsl:with-param name="ann_morphosyntax.xml" as="document-node()" select="doc($ann_morphosyntax.uri)"/>
+        <xsl:with-param name="ann_segmentation.xml" as="document-node()" select="doc($ann_segmentation.uri)"/>
+        <xsl:with-param name="my_textID" select="$my_textID" as="xs:string"/>
+      </xsl:call-template>
+    </xsl:for-each>
   </xsl:template>
   
   <xsl:template name="process_single_sample">
-    <xsl:variable name="text.xml" as="document-node()" select="doc($sourceDir || '/text.xml')"/>
-    <xsl:variable name="ann_morphosyntax.xml" as="document-node()"
-      select="doc($sourceDir || '/ann_morphosyntax.xml')"/>
-    <xsl:variable name="ann_segmentation.xml" as="document-node()"
-      select="doc($sourceDir || '/ann_segmentation.xml')"/>
-
+    <xsl:param name="text.xml" as="document-node()"/>
+    <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
+    <xsl:param name="ann_segmentation.xml" as="document-node()"/>
+    <xsl:param name="my_textID" as="xs:string" select="'0BAD_textID'"/>
+    
+    <xsl:variable name="targetBaseDir" as="xs:string" select="$targetCorpusDir_slashed || $docID || '/' || $my_textID"/>
+    
+    <xsl:variable name="compoundID" as="xs:string"
+      select="$corpusID || '_' || $docID || '.' || $my_textID"/>
+    <!--  this is what occurs in the text and data layers as @docid -->
+    
+    
     <xsl:call-template name="create_data">
       <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
-      <xsl:with-param name="target" select="$targetTextDir_slashed || 'data.xml'" as="xs:string"/>
+      <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
+      <xsl:with-param name="target" select="$targetBaseDir ||  '/data.xml'" as="xs:string"/>
     </xsl:call-template>
 
     <xsl:call-template name="create_struct">
       <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
+      <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
       <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml"
         as="document-node()"/>
-      <xsl:with-param name="target" select="$targetTextDir_slashed || 'struct/structure.xml'" as="xs:string"
+      <xsl:with-param name="target" select="$targetBaseDir ||  '/struct/structure.xml'" as="xs:string"
       />
     </xsl:call-template>
 
     <xsl:call-template name="create_morpho">
       <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
+      <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
       <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml"
         as="document-node()"/>
       <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml"
         as="document-node()"/>
-      <xsl:with-param name="target" select="$targetTextDir_slashed || 'nkjp/morpho.xml'" as="xs:string"/>
+      <xsl:with-param name="target" select="$targetBaseDir ||  '/nkjp/morpho.xml'" as="xs:string"/>
     </xsl:call-template>
 
     <xsl:call-template name="create_text_header">
       <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
-      <xsl:with-param name="target" select="$targetTextDir_slashed || 'header.xml'" as="xs:string"/>
+      <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
+      <xsl:with-param name="target" select="$targetBaseDir ||  '/header.xml'" as="xs:string"/>
     </xsl:call-template>
 
   </xsl:template>
@@ -145,6 +178,7 @@
 
   <xsl:template name="create_data">
     <xsl:param name="text.xml" as="document-node()"/>
+    <xsl:param name="compoundID" as="xs:string"/>
     <xsl:param name="target" as="xs:string"/>
     <!-- create the data.xml file -->
     <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
@@ -168,6 +202,7 @@
 
   <xsl:template name="create_struct">
     <xsl:param name="text.xml" as="document-node()"/>
+    <xsl:param name="compoundID" as="xs:string"/>
     <xsl:param name="ann_segmentation.xml" as="document-node()"/>
     <xsl:param name="target" as="xs:string"/>
 
@@ -284,6 +319,7 @@
 
   <xsl:template name="create_morpho">
     <xsl:param name="text.xml" as="document-node()"/>
+    <xsl:param name="compoundID" as="xs:string"/>
     <xsl:param name="ann_segmentation.xml" as="document-node()"/>
     <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
     <xsl:param name="target" as="xs:string"/>
@@ -388,11 +424,11 @@
     <xsl:variable name="chosen-msd" as="xs:string" select="$my_choice-lex/descendant::tei:symbol[@xml:id eq $my_choice-id]/@value"/>
     <xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
     <xsl:variable name="preceding-count" select="count($preceding)"/>
-    <xsl:variable name="outside-preceding-count" as="xs:integer">
+    <!--<xsl:variable name="outside-preceding-count" as="xs:integer">
       <xsl:choose>
-        <xsl:when test="self::tei:s or self::tei:p">
+        <xsl:when test="self::tei:s or self::tei:p">   <!-\- THIS NEEDS TO BE REVISITED AFTER THIS TEMPLATE HAS BECOME MORE SPECIFIC -\->
           <xsl:choose>
-            <xsl:when test="$preceding-count">
+            <xsl:when test="$preceding-count">                            commented out for now
               <xsl:sequence select="
                 sum(for $p in $preceding
                 return
@@ -407,9 +443,8 @@
           <xsl:sequence select="0"/>
         </xsl:otherwise>
       </xsl:choose>
-    </xsl:variable>
-    <xsl:variable name="my_index" select="$index + 1 + $preceding-count + $outside-preceding-count"
-      as="xs:integer"/>
+    </xsl:variable>-->
+    <xsl:variable name="my_index" select="$index + 1 + $preceding-count" as="xs:integer"/>
     
     <xsl:variable name="start" as="xs:integer">
       <xsl:variable name="numbers" select="substring-after(substring-before(@corresp,')'),',')"/>
@@ -454,17 +489,19 @@
         </xsl:element>
       </xsl:element>
     </xsl:element>
-    <xsl:apply-templates mode="morpho">
+    <xsl:apply-templates mode="morpho">
       <xsl:with-param name="ini" select="$start" as="xs:integer"/>
       <xsl:with-param name="fin" select="$end" as="xs:integer"/>
       <xsl:with-param name="index" select="$my_index"/>
       <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml" as="document-node()"/>
-    </xsl:apply-templates>
+    </xsl:apply-templates> <!-- NOTE(review): an unmatched comment terminator used to follow this tag and would have been emitted as literal text; confirm the recursion is meant to stay enabled -->
   </xsl:template>
+
   <!--   **************************        TEXT header      *******************                -->
 
   <xsl:template name="create_text_header">
     <xsl:param name="text.xml" as="document-node()"/>
+    <xsl:param name="compoundID" as="xs:string"/>
     <xsl:param name="target" as="xs:string"/>
 
     <!-- create the local header.xml file -->
@@ -472,7 +509,9 @@
       xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">
 
       <idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
-        <xsl:apply-templates select="$text.xml//tei:TEI/tei:teiHeader/tei:*" mode="text"/>
+        <xsl:apply-templates select="$text.xml//tei:TEI/tei:teiHeader/tei:*" mode="text">
+          <xsl:with-param name="compoundID" as="xs:string" select="$compoundID" tunnel="yes"/>
+        </xsl:apply-templates>
       </idsHeader>
     </xsl:result-document>
   </xsl:template>
@@ -490,9 +529,10 @@
   </xsl:template>
 
   <xsl:template match="tei:titleStmt" mode="text">
+    <xsl:param name="compoundID" as="xs:string" tunnel="yes"/>
     <titleStmt>
       <textSigle>
-        <xsl:value-of select="$corpusID || '/' || $docID || '.' || $textID"/>
+        <xsl:value-of select="$corpusID || '/' || substring-after($compoundID, $corpusID || '_')"/><!-- NOTE(review): keeps the previous corpus/doc.text sigle form; $compoundID itself is the corpus_doc.text @docid form. Confirm which one the header should carry -->
       </textSigle>
       <xsl:apply-templates mode="text"/>
     </titleStmt>