derive structure.xml; the script isn't optimized yet but I would like to submit the output for a check
Change-Id: Ib7ae6aed1e661490dbd2b37d7818205a7ec50441
diff --git a/nkjp2korap.xsl b/nkjp2korap.xsl
index e186ef1..46b4bb6 100644
--- a/nkjp2korap.xsl
+++ b/nkjp2korap.xsl
@@ -26,24 +26,94 @@
<xsl:variable name="KorAP_namespace" select="'http://ids-mannheim.de/ns/KorAP'" static="true"
as="xs:string"/>
+ <xsl:variable name="KorAP-XML_version" select="'KorAP-0.4'" as="xs:string" static="true"/>
+ <!-- written to the @version attribute of the generated layer element -->
+
+ <xsl:variable name="compoundID" as="xs:string"
+ select="$corpusID || '_' || $docID || '.' || $textID"/>
+ <!-- this is what occurs in the text and data layers as @docid -->
<xsl:mode name="corpus" on-no-match="deep-skip"/>
<xsl:mode name="text" on-no-match="deep-skip"/>
+ <!-- <xsl:variable name="text_depth" as="xs:integer" select="xs:integer('2')" static="true"/>
+ <!-\- this magic number indicates the depth of the <TEI> element inside teiCorpus/TEI -\->
+-->
+  <!-- Nesting level of $node: the number of elements on its ancestor-or-self axis,
+       leaving out every element whose local name is 'TEI' or 'teiCorpus'. -->
+  <xsl:function name="f:compute_nesting" as="xs:integer">
+    <xsl:param name="node" as="node()"/>
+    <xsl:sequence
+      select="count($node/ancestor-or-self::*[not(local-name() = ('TEI', 'teiCorpus'))])"/>
+  </xsl:function>
+
+  <!-- Returns the offset at which $node ends: the start offset plus the length read
+       from a @corresp value. Despite its name the function does not return a length.
+
+       @corresp is parsed positionally: the text between its first ',' and its first ')'
+       must read "offset,length". NOTE(review): presumably a string-range() pointer
+       into text.xml; confirm against the source data.
+
+       Which @corresp is used:
+         - tei:text, tei:body, tei:p, tei:s, and any node without a @corresp of its own:
+           the last descendant tei:seg/@corresp in document order;
+         - every other node: its own @corresp.
+
+       Returns 0 when no @corresp can be found (for example an empty paragraph).
+       A @corresp that is present but does not contain two integers still raises the
+       cast error of xs:integer(), so malformed data is not silently accepted. -->
+  <xsl:function name="f:calc_content_length" as="xs:integer">
+    <xsl:param name="node" as="node()"/>
+    <xsl:variable name="is_container" as="xs:boolean"
+      select="exists($node/(self::tei:text | self::tei:body | self::tei:p | self::tei:s))"/>
+    <!-- looking at all descendant segments (instead of last p / last s / last seg)
+         keeps the result defined when the last paragraph or sentence is empty -->
+    <xsl:variable name="last_corresp" as="attribute(corresp)?"
+      select="if ($is_container or empty($node/@corresp))
+              then ($node/descendant::tei:seg/@corresp)[last()]
+              else $node/@corresp"/>
+    <!-- "offset,length" -->
+    <xsl:variable name="numbers" as="xs:string"
+      select="substring-after(substring-before($last_corresp, ')'), ',')"/>
+    <xsl:sequence
+      select="if (empty($last_corresp))
+              then 0
+              else xs:integer(substring-before($numbers, ','))
+                   + xs:integer(substring-after($numbers, ','))"/>
+  </xsl:function>
+ <!-- Entry point: reads text.xml, ann_morphosyntax.xml and ann_segmentation.xml from
+ $sourceDir and calls one named template per output file, handing each its target path. -->
<xsl:template name="xsl:initial-template">
<xsl:variable name="text.xml" as="document-node()" select="doc($sourceDir || 'text.xml')"/>
+ <xsl:variable name="ann_morphosyntax.xml" as="document-node()"
+ select="doc($sourceDir || 'ann_morphosyntax.xml')"/>
+ <xsl:variable name="ann_segmentation.xml" as="document-node()"
+ select="doc($sourceDir || 'ann_segmentation.xml')"/>
<xsl:call-template name="create_data">
<xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
+ <xsl:with-param name="target" select="$targetTextDir || 'data.xml'" as="xs:string"/>
+ </xsl:call-template>
+
+ <!-- structure layer: receives ann_segmentation.xml -->
+ <xsl:call-template name="create_struct">
+ <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
+ <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml"
+ as="document-node()"/>
+ <xsl:with-param name="target" select="$targetTextDir || 'struct/structure.xml'" as="xs:string"
+ />
+ </xsl:call-template>
+
+ <!-- morpho layer: receives ann_morphosyntax.xml -->
+ <xsl:call-template name="create_morpho">
+ <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
+ <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml"
+ as="document-node()"/>
+ <xsl:with-param name="target" select="$targetTextDir || 'nkjp/morpho.xml'" as="xs:string"/>
</xsl:call-template>
<xsl:call-template name="create_text_header">
<xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
+ <xsl:with-param name="target" select="$targetTextDir || 'header.xml'" as="xs:string"/>
</xsl:call-template>
<xsl:call-template name="create_corpus_header">
<xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
+ <xsl:with-param name="target" select="$targetCorpusDir || 'header.xml'" as="xs:string"/>
</xsl:call-template>
</xsl:template>
@@ -51,13 +121,14 @@
<xsl:template name="create_data">
<xsl:param name="text.xml" as="document-node()"/>
+ <xsl:param name="target" as="xs:string"/>
<!-- create the data.xml file -->
<xsl:result-document encoding="UTF-8" method="xml" indent="yes"
- xpath-default-namespace="{$KorAP_namespace}" href="{$targetTextDir || 'data.xml'}">
+ xpath-default-namespace="{$KorAP_namespace}" href="{$target}">
<xsl:processing-instruction name="xml-model">href="text.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"</xsl:processing-instruction>
<xsl:element name="raw_text" namespace="{$KorAP_namespace}">
- <xsl:attribute name="docid" select="$corpusID || '_' || $docID || '.' || $textID"/>
+ <xsl:attribute name="docid" select="$compoundID"/>
<xsl:element name="metadata" namespace="{$KorAP_namespace}">
<xsl:attribute name="file" select="'metadata.xml'"/>
</xsl:element>
@@ -69,13 +140,179 @@
</xsl:result-document>
</xsl:template>
+ <!-- ************************** struct ******************* -->
+
+  <!-- Writes the structure layer to $target: a KorAP layer element carrying @docid and
+       @version, with one spanList filled by processing every tei:text found in
+       $ann_segmentation.xml in mode "struct".
+       $text.xml is accepted for signature parity with the other create_* templates
+       and is not read here. -->
+  <xsl:template name="create_struct">
+    <xsl:param name="text.xml" as="document-node()"/>
+    <xsl:param name="ann_segmentation.xml" as="document-node()"/>
+    <xsl:param name="target" as="xs:string"/>
+
+    <xsl:variable name="text_roots" as="element(tei:text)*"
+      select="$ann_segmentation.xml/descendant::tei:text"/>
+
+    <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
+      xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
+      <xsl:processing-instruction name="xml-model">href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"</xsl:processing-instruction>
+      <xsl:element name="layer" namespace="{$KorAP_namespace}">
+        <xsl:attribute name="docid" select="$compoundID"/>
+        <xsl:attribute name="version" select="$KorAP-XML_version"/>
+        <xsl:element name="spanList" namespace="{$KorAP_namespace}">
+          <xsl:apply-templates select="$text_roots" mode="struct"/>
+        </xsl:element>
+      </xsl:element>
+    </xsl:result-document>
+  </xsl:template>
+
+  <!-- Emits one KorAP span for the current TEI element, then processes its children
+       in the same mode.
+
+       Parameters (all optional):
+         ini   - start offset of the parent span; becomes this span's start when the
+                 element contains no tei:seg with @corresp
+         fin   - end offset of the parent span; only forwarded to the children
+         index - numeric id of the parent span
+
+       Span attributes:
+         id   - 's' plus: index + 1 + number of preceding siblings with the same local
+                name + (for tei:p and tei:s) the number of elements inside those siblings
+         from - 0 for tei:text and tei:body; otherwise the offset in the first
+                tei:seg/@corresp at or below the element
+         to   - offset + length of the last tei:seg/@corresp at or below the element
+         l    - f:compute_nesting(.)
+       An element without any such segment gets from = to = $ini (0 for text/body)
+       instead of aborting the transformation with a type error.
+
+       NOTE(review): the id arithmetic counts only same-named preceding siblings, so
+       ids stay distinct only while all siblings share one element name - confirm for
+       sources that mix other TEI elements among the segments. -->
+  <xsl:template match="tei:*" mode="struct">
+    <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
+    <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
+    <xsl:param name="index" as="xs:integer" required="no" select="1"/>
+
+    <xsl:variable name="my_name" select="local-name()" as="xs:string"/>
+    <xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
+    <!-- sibling subtrees are disjoint, so one count over all of them equals the sum
+         of the per-sibling counts -->
+    <xsl:variable name="outside-preceding-count" as="xs:integer"
+      select="if (self::tei:s or self::tei:p) then count($preceding/descendant::*) else 0"/>
+    <xsl:variable name="my_index" as="xs:integer"
+      select="$index + 1 + count($preceding) + $outside-preceding-count"/>
+
+    <!-- the segments that delimit this element, in document order -->
+    <xsl:variable name="segments" as="element(tei:seg)*"
+      select="descendant-or-self::tei:seg[@corresp]"/>
+    <!-- "offset,length" of the first segment; '' when there is none -->
+    <xsl:variable name="first_numbers" as="xs:string"
+      select="substring-after(substring-before($segments[1]/@corresp, ')'), ',')"/>
+
+    <xsl:variable name="start" as="xs:integer">
+      <xsl:choose>
+        <xsl:when test="self::tei:text or self::tei:body">
+          <xsl:sequence select="0"/>
+        </xsl:when>
+        <xsl:when test="exists($segments)">
+          <xsl:sequence select="xs:integer(substring-before($first_numbers, ','))"/>
+        </xsl:when>
+        <xsl:otherwise>
+          <xsl:sequence select="$ini"/>
+        </xsl:otherwise>
+      </xsl:choose>
+    </xsl:variable>
+    <!-- the function is handed the last segment itself, so it reads that segment's own
+         @corresp -->
+    <xsl:variable name="end" as="xs:integer"
+      select="if (exists($segments)) then f:calc_content_length($segments[last()]) else $start"/>
+
+    <xsl:element name="span" namespace="{$KorAP_namespace}">
+      <xsl:attribute name="id" select="'s' || $my_index"/>
+      <xsl:attribute name="from" select="$start"/>
+      <xsl:attribute name="to" select="$end"/>
+      <xsl:attribute name="l" select="f:compute_nesting(.)"/>
+      <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
+        <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
+          <xsl:attribute name="name" select="local-name()"/>
+        </xsl:element>
+        <!-- source attributes are copied as a nested feature structure -->
+        <xsl:if test="exists(@*)">
+          <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
+            <xsl:attribute name="name" select="'attr'"/>
+            <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
+              <xsl:attribute name="type" select="'attr'"/>
+              <xsl:for-each select="@*">
+                <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
+                  <xsl:attribute name="name" select="local-name(.)"/>
+                  <xsl:value-of select="."/>
+                </xsl:element>
+              </xsl:for-each>
+            </xsl:element>
+          </xsl:element>
+        </xsl:if>
+      </xsl:element>
+    </xsl:element>
+
+    <xsl:apply-templates mode="struct">
+      <xsl:with-param name="ini" select="$start" as="xs:integer"/>
+      <xsl:with-param name="fin" select="$end" as="xs:integer"/>
+      <xsl:with-param name="index" select="$my_index"/>
+    </xsl:apply-templates>
+  </xsl:template>
+
+ <!-- ************************** morpho ******************* -->
+
+  <!-- Opens the morpho layer document at $target. For now only the xml-model
+       processing instruction is written; neither $text.xml nor
+       $ann_morphosyntax.xml is read yet. -->
+  <xsl:template name="create_morpho">
+    <xsl:param name="text.xml" as="document-node()"/>
+    <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
+    <xsl:param name="target" as="xs:string"/>
+
+    <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
+      xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
+      <xsl:processing-instruction name="xml-model">href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"</xsl:processing-instruction>
+    </xsl:result-document>
+  </xsl:template>
+
+
+ <!-- ************************** TEXT header ******************* -->
+
+  <!-- Writes the text-level header to $target: an idsHeader element whose content
+       comes from processing the fileDesc of $text.xml in mode "text". -->
+  <xsl:template name="create_text_header">
+    <xsl:param name="text.xml" as="document-node()"/>
+    <xsl:param name="target" as="xs:string"/>
+
+    <xsl:variable name="file_desc" select="$text.xml//tei:TEI/tei:teiHeader/tei:fileDesc"/>
+
+    <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
+      xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
+      <idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
+        <xsl:apply-templates select="$file_desc" mode="text"/>
+      </idsHeader>
+    </xsl:result-document>
+  </xsl:template>
+
+  <!-- tei:fileDesc becomes a fileDesc wrapper; its child nodes are processed in mode "text". -->
+  <xsl:template match="tei:fileDesc" mode="text">
+    <fileDesc>
+      <xsl:apply-templates select="node()" mode="text"/>
+    </fileDesc>
+  </xsl:template>
+
+  <!-- tei:title becomes t.title. The child nodes are processed without a mode
+       attribute, i.e. in the stylesheet's default mode rather than in mode "text". -->
+  <xsl:template match="tei:title" mode="text">
+    <t.title>
+      <xsl:apply-templates select="node()"/>
+    </t.title>
+  </xsl:template>
+
+  <!-- tei:titleStmt becomes titleStmt: first a textSigle built from $corpusID and
+       $textID joined by '/', then the result of processing the child nodes in mode "text". -->
+  <xsl:template match="tei:titleStmt" mode="text">
+    <titleStmt>
+      <textSigle>
+        <xsl:value-of select="concat($corpusID, '/', $textID)"/>
+      </textSigle>
+      <xsl:apply-templates select="node()" mode="text"/>
+    </titleStmt>
+  </xsl:template>
+
+
<!-- ************************** CORPUS header ******************* -->
<xsl:template name="create_corpus_header">
<xsl:param name="text.xml" as="document-node()"/>
+ <xsl:param name="target" as="xs:string"/>
<!-- create the corpus-level header.xml file -->
- <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
- href="{$targetCorpusDir || 'header.xml'}">
+ <xsl:result-document encoding="UTF-8" method="xml" indent="yes" href="{$target}">
<!--doctype-public="{$publicDoctypeI5}"
doctype-system="{$systemDoctypeI5}">
@@ -110,43 +347,6 @@
</xsl:template>
- <!-- ************************** TEXT header ******************* -->
-
- <xsl:template name="create_text_header">
- <xsl:param name="text.xml" as="document-node()"/>
-
- <!-- create the local header.xml file -->
- <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
- xpath-default-namespace="http://ids-mannheim.de/ns/KorAP"
- href="{$targetTextDir || 'header.xml'}">
-
- <idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
- <xsl:apply-templates select="$text.xml//tei:TEI/tei:teiHeader/tei:fileDesc" mode="text"/>
- </idsHeader>
- </xsl:result-document>
- </xsl:template>
-
- <xsl:template match="tei:fileDesc" mode="text">
- <fileDesc>
- <xsl:apply-templates mode="text"/>
- </fileDesc>
- </xsl:template>
-
- <xsl:template match="tei:title" mode="text">
- <t.title>
- <xsl:apply-templates/>
- </t.title>
- </xsl:template>
-
- <xsl:template match="tei:titleStmt" mode="text">
- <titleStmt>
- <textSigle>
- <xsl:value-of select="$corpusID || '/' || $textID"/>
- </textSigle>
- <xsl:apply-templates mode="text"/>
- </titleStmt>
- </xsl:template>
-