before migration from calc_content_length to calc_offsets
Change-Id: I43d17d8350a59f689674499ecf9e6bc171fd9ac9
diff --git a/nkjp2korap.xsl b/nkjp2korap.xsl
index b4909ba..d8babb5 100644
--- a/nkjp2korap.xsl
+++ b/nkjp2korap.xsl
@@ -9,16 +9,19 @@
<!-- PARAMETERS -->
<xsl:param name="sourceDir" select="'test/resources/nkjp2korap_sample2'" as="xs:string"/>
- <!-- the directory containing NKJP files, in the form of a collection of text-level dirs -->
+ <!-- the directory containing NKJP files, in the form of a collection of text-level dirs
+ (that is how we know both the $corpusID and the $docID) -->
<xsl:param name="targetDir" select="'test/output'" as="xs:string"/>
+ <!-- where the corpus/document/text/annotations hierarchy is going to be created -->
<xsl:param name="skip_docID" as="xs:string">
<xsl:value-of select="'HellerPodgladanie,IsakowiczZaleskiMoje,KolakowskiOco,MysliwskiKamien,WilkWilczy,ZycieWarszawy_Zycie'"/>
</xsl:param>
<!-- comma-separated list of document IDs to be skipped from processing
example: HellerPodgladanie,KOT
- no functionality beyond string identity is supported -->
+ no functionality beyond string identity is supported
+ (this is just for testing) -->
<!-- VARIABLES -->
@@ -28,15 +31,14 @@
<xsl:variable name="targetCorpusDir_slashed" select="$targetDir || '/' || $corpusID || '/'" as="xs:string"/>
- <xsl:variable name="systemDoctypeI5"
- select="'http://corpora.ids-mannheim.de/I5/DTD/i5.dtd'" as="xs:string"
- static="true"/>
+ <xsl:variable name="systemDoctypeI5" as="xs:string"
+ select="'http://corpora.ids-mannheim.de/I5/DTD/i5.dtd'" static="true"/>
- <xsl:variable name="publicDoctypeI5" select="'-//IDS//DTD I5 1.0//EN'" as="xs:string"
- static="true"/>
+ <xsl:variable name="publicDoctypeI5" as="xs:string" static="true"
+ select="'-//IDS//DTD I5 1.0//EN'"/>
- <xsl:variable name="KorAP_namespace" select="'http://ids-mannheim.de/ns/KorAP'" static="true"
- as="xs:string"/>
+ <xsl:variable name="KorAP_namespace" static="true" as="xs:string"
+ select="'http://ids-mannheim.de/ns/KorAP'"/>
<xsl:variable name="KorAP-XML_version" select="'KorAP-0.4'" as="xs:string" static="true"/>
<!-- this is only a bit funny -->
@@ -47,6 +49,12 @@
<!-- see https://www.saxonica.com/documentation11/index.html#!sourcedocs/collections/collection-directories -->
<xsl:variable name="collection_of_text" select="collection($sourceDir || '?' || $collection_params)" as="document-node()+"/>
+
+ <!-- these two constants are the positional indexes into the two-item sequence, (start, end),
+      that f:calc_offsets() returns: 1 selects the start and 2 selects the end;
+      they only exist to make expressions such as f:calc_offsets(...)[$OFFSET_END] readable -->
+ <xsl:variable name="OFFSET_START" as="xs:integer" static="true" select="1"/>
+ <xsl:variable name="OFFSET_END" as="xs:integer" static="true" select="2"/>
<!-- MODES -->
@@ -59,39 +67,225 @@
<!-- FUNCTIONS -->
<xsl:function name="f:compute_nesting" as="xs:integer">
- <xsl:param name="node" as="node()"/>
+ <xsl:param name="node" as="element()"/> <!-- the result is the number of ancestor-or-self elements of $node whose local name is neither TEI nor teiCorpus -->
<xsl:variable name="rel_depth"
select="count($node/ancestor-or-self::*[local-name(.) ne 'TEI'][local-name(.) ne 'teiCorpus'])"
as="xs:integer"/>
<xsl:sequence select="$rel_depth"/>
</xsl:function>
+
+ <!-- f:calc_offsets: returns the offsets of $node as a two-item sequence, (start, end);
+      pick an item with $OFFSET_START / $OFFSET_END. The end is computed as start + length - 1.
+      Handled nodes: tei:text, tei:body, tei:p, tei:s, tei:seg; the numbers are read from seg/@corresp.
+      With $skip_start = true() the start is not computed and 0 is used in its place. -->
+ <xsl:function name="f:calc_offsets" as="xs:integer+">
+   <xsl:param name="node" as="element()"/>
+   <xsl:param name="skip_start" as="xs:boolean"/>
+
+   <xsl:variable name="start" as="xs:integer">
+     <xsl:choose>
+       <xsl:when test="$skip_start or $node/self::tei:text or $node/self::tei:body">
+         <xsl:sequence select="0"/>
+       </xsl:when>
+
+       <!-- handle p -->
+       <xsl:when test="$node/self::tei:p">
+         <xsl:variable name="my_pos" as="xs:integer" select="count($node/preceding-sibling::tei:p) + 1"/>
+         <xsl:variable name="preceding" as="node()*"
+           select="$node/ancestor::tei:body/tei:p[position() lt $my_pos]"/>
+         <xsl:choose>
+           <xsl:when test="count($preceding) eq 0">
+             <xsl:sequence select="0"/>
+           </xsl:when>
+           <xsl:otherwise>
+             <!-- the "+ 1" is spelled out: the second argument of sum() is only the value returned
+                  for an empty sequence and is never added to a non-empty one -->
+             <xsl:sequence select="f:calc_offsets($preceding[last()], true())[$OFFSET_END] + 1"/>
+
+             <!-- BUG danger: I am not sure if a "1" should rather be added after each p; an alternative is
+                  to handle that in the return value of the $length variable, and make it sensitive to
+                  the skip_start parameter; the "+ 1" will then have to be removed from here!
+                  NOTE(review): with skip_start = true() the end of the preceding p is computed from a
+                  start of 0, so this start reflects only the immediately preceding p and does not
+                  accumulate over all preceding p's; confirm whether false() is intended here. -->
+
+             <!-- <xsl:variable name="last_corresps"
+               select="$preceding/descendant::tei:s[last()]/(descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'])[last()]/@corresp"
+               as="attribute(corresp)+"/>
+             <xsl:variable name="end_offsets" as="xs:integer+">
+               <xsl:for-each select="$last_corresps">
+                 <xsl:variable name="numbers"
+                   select="substring-after(substring-before(., ')'), ',')"/>
+                 <xsl:sequence
+                   select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"
+                 />
+               </xsl:for-each>
+             </xsl:variable>
+             <xsl:sequence select="sum($end_offsets, 1)"/>
+
+             this is a non-recursive variant that may turn out to be much less cpu-intensive, not sure
+             - but if it's plugged in, it will have to be adjusted to the current form of the recursive variant,
+             because it hasn't been maintained since it got commented out
+             -->
+           </xsl:otherwise>
+         </xsl:choose>
+       </xsl:when>
+
+       <!-- handle s -->
+       <!-- the value for s gets counted since the start of the current p
+            - so we look at the preceding s's
+            + the preceding p's
+       -->
+       <xsl:when test="$node/self::tei:s">
+         <!--<xsl:variable name="last_corresp"
+           select="$node/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/attribute::corresp"
+           as="attribute(corresp)"/>
+         <xsl:variable name="numbers"
+           select="substring-after(substring-before($last_corresp, ')'), ',')"/>
+         <xsl:sequence
+           select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"
+         />
+         -->
+
+         <xsl:variable name="internal_start" as="xs:integer">
+           <xsl:variable name="my_pos" as="xs:integer" select="count($node/preceding-sibling::tei:s) + 1"/>
+           <xsl:variable name="preceding" as="node()*"
+             select="$node/ancestor::tei:p[1]/tei:s[position() lt $my_pos]"/>
+           <xsl:choose>
+             <xsl:when test="count($preceding) eq 0">
+               <xsl:sequence select="0"/>
+             </xsl:when>
+             <xsl:otherwise>
+               <xsl:sequence select="f:calc_offsets($preceding[last()], true())[$OFFSET_END] + 1"/>
+               <!-- again, CAREFUL ABOUT THE +1, it might need to vanish -->
+             </xsl:otherwise>
+           </xsl:choose>
+         </xsl:variable>
+
+         <!-- start of the enclosing p: the function returns (start, end), so only the start item is bound here -->
+         <xsl:variable name="external_start" as="xs:integer" select="f:calc_offsets($node/ancestor::tei:p[1], false())[$OFFSET_START]"/>
+
+         <xsl:sequence select="$internal_start + $external_start"/>
+       </xsl:when>
+
+       <!-- handle seg -->
+       <xsl:when test="$node/self::tei:seg">
+         <!-- for segs, the s elements are irrelevant, and the local offset is immediately available on the @corresp -->
+         <xsl:variable name="numbers"
+           select="substring-after(substring-before($node/@corresp, ')'), ',')"/>
+         <xsl:variable name="internal_start" select="xs:integer(substring-before($numbers, ','))"
+           as="xs:integer"/>
+         <xsl:variable name="external_start" as="xs:integer"
+           select="f:calc_offsets($node/ancestor::tei:p[1], false())[$OFFSET_START]"/>
+
+         <xsl:if test="$node/self::tei:seg and count($node/@nkjp:rejected)">
+           <xsl:message select="'numbers: ' || $numbers"/>
+         </xsl:if>
+         <xsl:sequence select="$internal_start + $external_start"/>
+       </xsl:when>
+
+       <!-- any other element has no start rule: fail with a readable message instead of a type error -->
+       <xsl:otherwise>
+         <xsl:sequence select="error((), 'f:calc_offsets: unsupported element ' || name($node))"/>
+       </xsl:otherwise>
+     </xsl:choose>
+   </xsl:variable>
+
+   <xsl:variable name="length" as="xs:integer">
+     <xsl:choose>
+       <xsl:when test="$node/self::tei:text or $node/self::tei:body">
+         <xsl:variable name="last_corresps"
+           select="$node/descendant::tei:p/descendant::tei:s[last()]/(descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'])[last()]/@corresp"
+           as="attribute(corresp)+"/>
+         <xsl:variable name="end_offsets" as="xs:integer+">
+           <xsl:for-each select="$last_corresps">
+             <xsl:variable name="numbers" select="substring-after(substring-before(., ')'), ',')"/>
+             <xsl:sequence
+               select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"
+             />
+           </xsl:for-each>
+         </xsl:variable>
+         <xsl:sequence select="sum($end_offsets)"/>
+       </xsl:when>
+
+       <xsl:when test="$node/self::tei:p">
+         <xsl:variable name="last_corresps"
+           select="$node/descendant::tei:s[last()]/(descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'])[last()]/@corresp"
+           as="attribute(corresp)+"/>
+         <xsl:variable name="end_offsets" as="xs:integer+">
+           <xsl:for-each select="$last_corresps">
+             <xsl:variable name="numbers" select="substring-after(substring-before(., ')'), ',')"/>
+             <xsl:sequence
+               select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"
+             />
+           </xsl:for-each>
+         </xsl:variable>
+         <xsl:sequence select="sum($end_offsets)"/>
+       </xsl:when>
+
+       <xsl:when test="$node/self::tei:s">
+         <xsl:variable name="last_corresp"
+           select="$node/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/attribute::corresp"
+           as="attribute(corresp)"/>
+         <xsl:variable name="numbers"
+           select="substring-after(substring-before($last_corresp, ')'), ',')"/>
+         <xsl:sequence
+           select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"
+         />
+       </xsl:when>
+
+       <xsl:otherwise>
+         <xsl:variable name="numbers"
+           select="substring-after(substring-before($node/@corresp, ')'), ',')"/>
+         <xsl:if test="$node/self::tei:seg and count($node/@nkjp:rejected)">
+           <!-- REMOVE THIS -->
+           <xsl:message select="$numbers"/>
+         </xsl:if>
+         <xsl:sequence
+           select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"
+         />
+       </xsl:otherwise>
+     </xsl:choose>
+   </xsl:variable>
+
+   <xsl:message select="'length: ' || $length"/>
+
+   <!-- NOTE(review): for s and seg, $length is the sum of the two @corresp numbers; for seg the first of them is
+        also part of $start, so it enters the end twice when skip_start is false(); confirm the intended end -->
+   <xsl:sequence select="$start, $start + $length - 1"/>
+ </xsl:function>
+
<xsl:function name="f:calc_content_length" as="xs:integer">
<xsl:param name="node" as="node()"/>
<xsl:choose>
<xsl:when test="$node/self::tei:text or $node/self::tei:body">
<xsl:variable name="last_corresp"
- select="$node/descendant::tei:p[last()]/descendant::tei:s[last()]/descendant::tei:seg[last()]/attribute::corresp"
+ select="$node/descendant::tei:p[last()]/descendant::tei:s[last()]/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/attribute::corresp"
as="attribute(corresp)"/>
<xsl:variable name="numbers" select="substring-after(substring-before($last_corresp,')'),',')"/>
<xsl:sequence select="xs:integer(substring-before($numbers,',')) + xs:integer(substring-after($numbers,','))"/>
</xsl:when>
<xsl:when test="$node/self::tei:p">
<xsl:variable name="last_corresp"
- select="$node/descendant::tei:s[last()]/descendant::tei:seg[last()]/attribute::corresp"
+ select="$node/descendant::tei:s[last()]/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/attribute::corresp"
as="attribute(corresp)"/>
<xsl:variable name="numbers" select="substring-after(substring-before($last_corresp,')'),',')"/>
<xsl:sequence select="xs:integer(substring-before($numbers,',')) + xs:integer(substring-after($numbers,','))"/>
</xsl:when>
<xsl:when test="$node/self::tei:s">
<xsl:variable name="last_corresp"
- select="$node/descendant::tei:seg[last()]/attribute::corresp"
+ select="$node/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/attribute::corresp"
as="attribute(corresp)"/>
<xsl:variable name="numbers" select="substring-after(substring-before($last_corresp,')'),',')"/>
<xsl:sequence select="xs:integer(substring-before($numbers,',')) + xs:integer(substring-after($numbers,','))"/>
</xsl:when>
<xsl:otherwise>
<xsl:variable name="numbers" select="substring-after(substring-before($node/@corresp,')'),',')"/>
+ <xsl:if test="$node/self::tei:seg and count($node/@nkjp:rejected)">
+ <!-- REMOVE THIS -->
+ <xsl:message select="$numbers"/>
+ </xsl:if>
<xsl:sequence select="xs:integer(substring-before($numbers,',')) + xs:integer(substring-after($numbers,','))"/>
</xsl:otherwise>
</xsl:choose>
@@ -122,7 +316,7 @@
<xsl:template name="xsl:initial-template">
<xsl:variable name="IDs_to_skip" select="tokenize($skip_docID,',')" as="xs:string*"/>
-
+
<!-- we only want to call the template below once, and we process a random NKJP corpus file for that purpose,
because all we need is the main corpus header, and we can (should) get to that from any NKJP corpus document -->
<xsl:call-template name="create_corpus_header">
@@ -139,14 +333,17 @@
<xsl:choose>
<xsl:when test="$my_textID = $IDs_to_skip"/>
<xsl:otherwise>
- <xsl:call-template name="process_single_sample">
+
+ <xsl:message select="f:calc_offsets(doc($ann_segmentation.uri)//tei:body/tei:p[4],false())"/>
+
+ <!--<xsl:call-template name="process_single_sample">
<xsl:with-param name="text.xml" as="document-node()" select="."/>
<xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
select="doc($ann_morphosyntax.uri)"/>
<xsl:with-param name="ann_segmentation.xml" as="document-node()"
select="doc($ann_segmentation.uri)"/>
<xsl:with-param name="my_textID" select="$my_textID" as="xs:string"/>
- </xsl:call-template>
+ </xsl:call-template>-->
</xsl:otherwise>
</xsl:choose>
</xsl:for-each>
@@ -156,7 +353,7 @@
<xsl:param name="text.xml" as="document-node()"/>
<xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
<xsl:param name="ann_segmentation.xml" as="document-node()"/>
- <xsl:param name="my_textID" as="xs:string" select="'0BAD_textID'"/>
+ <xsl:param name="my_textID" as="xs:string" select="'0-BAD_textID'"/>
<xsl:variable name="targetBaseDir" as="xs:string" select="$targetCorpusDir_slashed || $docID || '/' || $my_textID"/>
@@ -216,7 +413,8 @@
</xsl:element>
<xsl:element name="text" namespace="{$KorAP_namespace}">
- <xsl:value-of select="$text.xml//*[local-name() = 'ab']"/>
+ <!--<xsl:value-of select="$text.xml//*[local-name() = 'ab']"/>-->
+ <xsl:apply-templates select="$text.xml//*[local-name() = 'ab']"/>
</xsl:element>
</xsl:element>
</xsl:result-document>