stylesheet redone for handling larger datasets; just struct and morpho for now, though
Change-Id: I708d2f0f2c2061428699b09c70998ada016b36d4
diff --git a/nkjp2korap.xsl b/nkjp2korap.xsl
index c70a01d..4bfdb7a 100644
--- a/nkjp2korap.xsl
+++ b/nkjp2korap.xsl
@@ -5,17 +5,18 @@
xmlns:map="http://www.w3.org/2005/xpath-functions/map" exclude-result-prefixes="xs f map nkjp tei"
version="3.0" expand-text="yes">
- <xsl:param name="corpusID" as="xs:string"/>
- <xsl:param name="docID" as="xs:string"/>
- <xsl:param name="textID" as="xs:string"/>
+
+<!-- PARAMETERS -->
<xsl:param name="sourceDir" select="'test/resources/nkjp2korap_sample2'" as="xs:string"/>
<!-- the directory containing NKJP files, in the form of a collection of text-level dirs -->
<xsl:param name="targetDir" select="'test/output'" as="xs:string"/>
- <xsl:variable name="targetTextDir_slashed"
- select="$targetDir || '/' || $corpusID || '/' || $docID || '/' || $textID || '/'" as="xs:string"/>
+<!-- VARIABLES -->
+
+ <xsl:variable name="corpusID" as="xs:string" select="'NKJP'" static="yes"/>
+ <xsl:variable name="docID" as="xs:string" select="'NKJP'" static="yes"/>
<xsl:variable name="targetCorpusDir_slashed" select="$targetDir || '/' || $corpusID || '/'" as="xs:string"/>
@@ -32,17 +33,22 @@
<xsl:variable name="KorAP-XML_version" select="'KorAP-0.4'" as="xs:string" static="true"/>
<!-- this is only a bit funny -->
- <xsl:variable name="compoundID" as="xs:string"
- select="$corpusID || '_' || $docID || '.' || $textID"/>
- <!-- this is what occurs in the text and data layers as @docid -->
+ <xsl:variable name="collection_params" as="xs:string" static="yes"
+ select="'recurse=yes;validation=strip;select=text.xml;content-type=application/xml;on-error=warning;xinclude=yes'"
+ />
+ <!-- see https://www.saxonica.com/documentation11/index.html#!sourcedocs/collections/collection-directories -->
+
+ <xsl:variable name="collection_of_text" select="collection($sourceDir || '?' || $collection_params)" as="document-node()+"/>
+
+<!-- MODES -->
<xsl:mode name="corpus" on-no-match="deep-skip"/>
<xsl:mode name="text" on-no-match="deep-skip"/>
<xsl:mode name="header-text" on-no-match="text-only-copy"/>
- <!-- <xsl:variable name="text_depth" as="xs:integer" select="xs:integer('2')" static="true"/>
- <!-\- this magic number indicates the depth of the <TEI> element inside teiCorpus/TEI -\->
--->
+
+ <!-- FUNCTIONS -->
+
<xsl:function name="f:compute_nesting" as="xs:integer">
<xsl:param name="node" as="node()"/>
<xsl:variable name="rel_depth"
@@ -86,57 +92,84 @@
<xsl:template match="@default" mode="#all"/>
<!-- this is to delete some auto-inserted attribute throughout -->
- <xsl:variable name="collection_params" as="xs:string" static="yes"
- select="'recurse=yes;validation=strip;select=text.xml;content-type=application/xml;on-error=warning;xinclude=yes'"
- />
+ <xsl:template match="tei:w" mode="#all"/>
+<!-- NKJP-SGJP has apparently abandoned standoff representations by adding <w> everywhere;
+ for the time being, we'll just stick to the standoff offsets, although that may need to
+ be revisited as the NKJP format has now begun to stray from its schemas and assumptions -->
-<xsl:variable name="collection_of_text" select="collection($sourceDir || '?' || $collection_params)" as="document-node()+"/>
+ <xsl:template match="tei:choice" mode="#all"/>
+<!-- THIS IS ONLY TEMPORARY,
+ because an interesting challenge came up where I will
+ probably have to abandon straightforward mapping because of TOKENIZATION alternatives;
+
+ but for now, I just want this stylesheet to work, even if it occasionally eats a token (which it currently does: 'komuś' and 'czym' vanish)
+ -->
<xsl:template name="xsl:initial-template">
- <xsl:variable name="text.xml" select="$collection_of_text[1]"/>
-
- <!-- we only want to call this once, and we process a random NKJP corpus file for that purpose,
+ <!-- we only want to call the template below once; the first NKJP text file in the collection is used for that purpose,
because all we need is the main corpus header, and we can (should) get to that from any NKJP corpus document -->
<xsl:call-template name="create_corpus_header">
- <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
+ <xsl:with-param name="text.xml" select="$collection_of_text[1]" as="document-node()"/>
<xsl:with-param name="target" select="$targetCorpusDir_slashed || 'header.xml'" as="xs:string"/>
</xsl:call-template>
-
+ <xsl:for-each select="$collection_of_text">
+ <xsl:variable name="my_dir" as="xs:string" select="replace(base-uri(),'/text\.xml','')"/>
+ <xsl:variable name="my_textID" as="xs:string" select="tokenize($my_dir,'/')[last()]"/>
+ <xsl:variable name="ann_morphosyntax.uri" select="$my_dir || '/ann_morphosyntax.xml'" as="xs:string"/>
+ <xsl:variable name="ann_segmentation.uri" select="$my_dir || '/ann_segmentation.xml'" as="xs:string"/>
+
+ <xsl:call-template name="process_single_sample">
+ <xsl:with-param name="text.xml" as="document-node()" select="."/>
+ <xsl:with-param name="ann_morphosyntax.xml" as="document-node()" select="doc($ann_morphosyntax.uri)"/>
+ <xsl:with-param name="ann_segmentation.xml" as="document-node()" select="doc($ann_segmentation.uri)"/>
+ <xsl:with-param name="my_textID" select="$my_textID" as="xs:string"/>
+ </xsl:call-template>
+ </xsl:for-each>
</xsl:template>
<xsl:template name="process_single_sample">
- <xsl:variable name="text.xml" as="document-node()" select="doc($sourceDir || '/text.xml')"/>
- <xsl:variable name="ann_morphosyntax.xml" as="document-node()"
- select="doc($sourceDir || '/ann_morphosyntax.xml')"/>
- <xsl:variable name="ann_segmentation.xml" as="document-node()"
- select="doc($sourceDir || '/ann_segmentation.xml')"/>
-
+ <xsl:param name="text.xml" as="document-node()"/>
+ <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
+ <xsl:param name="ann_segmentation.xml" as="document-node()"/>
+ <xsl:param name="my_textID" as="xs:string" select="'0BAD_textID'"/>
+
+ <xsl:variable name="targetBaseDir" as="xs:string" select="$targetCorpusDir_slashed || $docID || '/' || $my_textID"/>
+
+ <xsl:variable name="compoundID" as="xs:string"
+ select="$corpusID || '_' || $docID || '.' || $my_textID"/>
+ <!-- this is what occurs in the text and data layers as @docid -->
+
+
<xsl:call-template name="create_data">
<xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
- <xsl:with-param name="target" select="$targetTextDir_slashed || 'data.xml'" as="xs:string"/>
+ <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
+ <xsl:with-param name="target" select="$targetBaseDir || '/data.xml'" as="xs:string"/>
</xsl:call-template>
<xsl:call-template name="create_struct">
<xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
+ <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
<xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml"
as="document-node()"/>
- <xsl:with-param name="target" select="$targetTextDir_slashed || 'struct/structure.xml'" as="xs:string"
+ <xsl:with-param name="target" select="$targetBaseDir || '/struct/structure.xml'" as="xs:string"
/>
</xsl:call-template>
<xsl:call-template name="create_morpho">
<xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
+ <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
<xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml"
as="document-node()"/>
<xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml"
as="document-node()"/>
- <xsl:with-param name="target" select="$targetTextDir_slashed || 'nkjp/morpho.xml'" as="xs:string"/>
+ <xsl:with-param name="target" select="$targetBaseDir || '/nkjp/morpho.xml'" as="xs:string"/>
</xsl:call-template>
<xsl:call-template name="create_text_header">
<xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
- <xsl:with-param name="target" select="$targetTextDir_slashed || 'header.xml'" as="xs:string"/>
+ <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
+ <xsl:with-param name="target" select="$targetBaseDir || '/header.xml'" as="xs:string"/>
</xsl:call-template>
</xsl:template>
@@ -145,6 +178,7 @@
<xsl:template name="create_data">
<xsl:param name="text.xml" as="document-node()"/>
+ <xsl:param name="compoundID" as="xs:string"/>
<xsl:param name="target" as="xs:string"/>
<!-- create the data.xml file -->
<xsl:result-document encoding="UTF-8" method="xml" indent="yes"
@@ -168,6 +202,7 @@
<xsl:template name="create_struct">
<xsl:param name="text.xml" as="document-node()"/>
+ <xsl:param name="compoundID" as="xs:string"/>
<xsl:param name="ann_segmentation.xml" as="document-node()"/>
<xsl:param name="target" as="xs:string"/>
@@ -284,6 +319,7 @@
<xsl:template name="create_morpho">
<xsl:param name="text.xml" as="document-node()"/>
+ <xsl:param name="compoundID" as="xs:string"/>
<xsl:param name="ann_segmentation.xml" as="document-node()"/>
<xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
<xsl:param name="target" as="xs:string"/>
@@ -388,11 +424,11 @@
<xsl:variable name="chosen-msd" as="xs:string" select="$my_choice-lex/descendant::tei:symbol[@xml:id eq $my_choice-id]/@value"/>
<xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
<xsl:variable name="preceding-count" select="count($preceding)"/>
- <xsl:variable name="outside-preceding-count" as="xs:integer">
+ <!--<xsl:variable name="outside-preceding-count" as="xs:integer">
<xsl:choose>
- <xsl:when test="self::tei:s or self::tei:p">
+ <xsl:when test="self::tei:s or self::tei:p"> <!-\- THIS NEEDS TO BE REVISITED AFTER THIS TEMPLATE HAS BECOME MORE SPECIFIC -\->
<xsl:choose>
- <xsl:when test="$preceding-count">
+ <xsl:when test="$preceding-count"> commented out for now
<xsl:sequence select="
sum(for $p in $preceding
return
@@ -407,9 +443,8 @@
<xsl:sequence select="0"/>
</xsl:otherwise>
</xsl:choose>
- </xsl:variable>
- <xsl:variable name="my_index" select="$index + 1 + $preceding-count + $outside-preceding-count"
- as="xs:integer"/>
+ </xsl:variable>-->
+ <xsl:variable name="my_index" select="$index + 1 + $preceding-count" as="xs:integer"/>
<xsl:variable name="start" as="xs:integer">
<xsl:variable name="numbers" select="substring-after(substring-before(@corresp,')'),',')"/>
@@ -454,17 +489,19 @@
</xsl:element>
</xsl:element>
</xsl:element>
- <xsl:apply-templates mode="morpho">
+ <!--<xsl:apply-templates mode="morpho">
<xsl:with-param name="ini" select="$start" as="xs:integer"/>
<xsl:with-param name="fin" select="$end" as="xs:integer"/>
<xsl:with-param name="index" select="$my_index"/>
<xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml" as="document-node()"/>
- </xsl:apply-templates>
+ </xsl:apply-templates>-->
</xsl:template>
+
<!-- ************************** TEXT header ******************* -->
<xsl:template name="create_text_header">
<xsl:param name="text.xml" as="document-node()"/>
+ <xsl:param name="compoundID" as="xs:string"/>
<xsl:param name="target" as="xs:string"/>
<!-- create the local header.xml file -->
@@ -472,7 +509,9 @@
xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">
<idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
- <xsl:apply-templates select="$text.xml//tei:TEI/tei:teiHeader/tei:*" mode="text"/>
+ <xsl:apply-templates select="$text.xml//tei:TEI/tei:teiHeader/tei:*" mode="text">
+ <xsl:with-param name="compoundID" as="xs:string" select="$compoundID" tunnel="yes"/>
+ </xsl:apply-templates>
</idsHeader>
</xsl:result-document>
</xsl:template>
@@ -490,9 +529,10 @@
</xsl:template>
<xsl:template match="tei:titleStmt" mode="text">
+ <xsl:param name="compoundID" as="xs:string" tunnel="yes"/>
<titleStmt>
<textSigle>
- <xsl:value-of select="$corpusID || '/' || $docID || '.' || $textID"/>
+ <xsl:value-of select="$compoundID"/>
</textSigle>
<xsl:apply-templates mode="text"/>
</titleStmt>