This version attempts to re-traverse the tree over 6k times per single output document with structure in it, and I can't seem to avoid that. It does the necessary calculation perfectly but, naturally, in doing so it crashes my desktop.
Change-Id: Ibbbf76498f26ec6e5f4ac12cc688d852d89157f1
diff --git a/nkjp2korap.xsl b/nkjp2korap.xsl
index acc425d..126752d 100644
--- a/nkjp2korap.xsl
+++ b/nkjp2korap.xsl
@@ -3,7 +3,7 @@
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"
xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:f="func"
xmlns:fn="http://www.w3.org/2005/xpath-functions"
- xmlns:map="http://www.w3.org/2005/xpath-functions/map" exclude-result-prefixes="xs f map nkjp tei"
+ xmlns:map="http://www.w3.org/2005/xpath-functions/map" exclude-result-prefixes="xs f fn map nkjp tei"
version="3.0" expand-text="yes">
@@ -67,6 +67,32 @@
<xsl:mode name="corpus" on-no-match="deep-skip"/>
<xsl:mode name="text" on-no-match="deep-skip"/>
<xsl:mode name="header-text" on-no-match="text-only-copy"/>
+ <xsl:mode use-accumulators="#all"/>
+
+ <xsl:accumulator name="elem-offset-seq" as="map(xs:string, item())+" initial-value="(map{})">
+     <!-- Appends one map ('id', 'start', 'end') for every tei:w whose parent tei:seg has no
+          @nkjp:rejected attribute. The initial empty map is a sentinel that keeps the sequence
+          non-empty; the consumers in this stylesheet strip it with tail(). -->
+     <xsl:accumulator-rule match="tei:w[parent::tei:seg[count(@nkjp:rejected) eq 0]]" phase="end">
+         <!-- End offset of the previously recorded word; 0 while only the sentinel is present. -->
+         <xsl:variable name="previous_index" as="xs:integer"
+             select="if (count($value) eq 1) then 0 else map:get($value[last()], 'end')"/>
+<!--<xsl:message select="'previous_index:' || $previous_index"></xsl:message>-->
+         <!-- xs:integer() turns the result of f:is_preceded_by_ws() into an increment
+              (presumably 0 or 1, assuming the function returns a boolean). -->
+         <xsl:variable name="start" as="xs:integer"
+             select="$previous_index + xs:integer(f:is_preceded_by_ws(parent::tei:seg))"/>
+         <!-- 'end' has to be an absolute offset (start + word length): the next word reads it back
+              as its $previous_index, so storing the bare string-length() would reset the count. -->
+         <xsl:sequence select="
+             $value,
+             map {
+                 'id': string(parent::tei:seg/@xml:id),
+                 'start': $start,
+                 'end': $start + string-length()
+             }"/>
+     </xsl:accumulator-rule>
+ </xsl:accumulator>
<!-- FUNCTIONS -->
@@ -105,29 +131,29 @@
</xsl:choose>
</xsl:function>
-
-
-
-
<!-- UTILITY TEMPLATES -->
<xsl:template match="@default" mode="#all"/>
<!-- this is to delete some auto-inserted attribute throughout -->
- <!--<xsl:template match="tei:w" mode="#all"/> w is better than ab, now ... -->
+ <xsl:template match="tei:w" mode="#all"/>
<!-- NKJP-SGJP has apparently resigned from standoff representations by adding <w> everywhere;
for the time being, we'll just stick to the standoff offsets, although that may need to
be revisited as the NKJP format has now began to stray from its schemas and assumptions -->
- <xsl:template match="tei:choice" mode="#all"/>
+ <!--<xsl:template match="tei:choice" mode="#all"/>-->
<!-- THIS IS ONLY TEMPORARY,
because an interesting challenge came up where I will
probably have to abandon straightforward mapping because of TOKENIZATION alternatives;
but now, I just want this stylesheet to work, even if it eats some occasional token (which it now does, 'komuĊ' and 'czym' vanish)
-->
-
+
+ <!-- fall-thru: skip tei:choice itself and keep processing its tei:seg descendants in the current (struct) mode -->
+ <xsl:template match="tei:choice" mode="struct">
+     <xsl:apply-templates select="descendant::tei:seg" mode="#current"/>
+ </xsl:template>
<!-- MAIN PROCESSING -->
@@ -185,6 +211,116 @@
select="$corpusID || '_' || $docID || '.' || $my_textID"/>
<!-- this is what occurs in the text and data layers as @docid -->
+<!-- this is space devoted to recalculating word offsets on the basis of ann_segmentation.xml (rather than text.xml).
+ The results should be available to all annotation files, so we prepare a map here and send it off to whichever
+ annotation layer needs it. -->
+
+<!-- This is done in several steps, because I wanted to be able to look stuff up. There should probably be some
+ idiomatic way to reduce the memory footprint of these variables - I'll be happy to learn about it. -->
+
+ <!--<xsl:variable name="map_w" as="map(xs:untypedAtomic,item()+)">
+ <xsl:variable name="segs" select="$ann_segmentation.xml/tei:teiCorpus/tei:TEI/tei:text/tei:body/tei:p/tei:s//tei:seg[count(@nkjp:rejected) eq 0]" as="element()+"/>
+ <xsl:map>
+ <xsl:for-each-group select="$segs" group-by="ancestor::tei:p[1]/@xml:id">
+ <xsl:variable name="current-p" select="current-grouping-key()"/>
+ <xsl:for-each-group select="current-group()" group-by="ancestor::tei:s[1]/@xml:id">
+ <xsl:variable name="current-s" select="current-grouping-key()"/>
+ <xsl:for-each select="current-group()">
+ <xsl:map-entry key="@xml:id" select="$current-p, $current-s, position(), f:is_preceded_by_ws(.), normalize-space(tei:w)"/>
+ </xsl:for-each>
+ </xsl:for-each-group>
+ </xsl:for-each-group>
+ </xsl:map>
+ </xsl:variable>-->
+
+ <!--<xsl:variable name="map_p-s-w" as="map(xs:untypedAtomic,item()+)">
+ <xsl:map>
+ <xsl:for-each-group select="map:keys($map_w)" group-by="map:get($map_w, .)[1]">
+ <xsl:sort select="xs:integer(substring-before(substring-after(., 'segm_'), '.'))"
+ order="ascending"/>
+ <xsl:sort select="xs:integer(substring-before(substring-after(., '.'), '-'))"
+ order="ascending"/>
+ <xsl:variable name="current-p-pos" select="fn:position()" as="xs:integer"/>
+ <!-\- the above is used in the sentence loop, when we check if it's text-initial -\->
+ <xsl:variable name="current-p" select="fn:current-grouping-key()" as="xs:string"/> <!-\-xs:untypedAtomic-\->
+ <xsl:variable name="p-length" select="
+ sum(for $id in current-group()
+ return
+ string-length(map:get($map_w, $id)[5])) + count(current-group()) - xs:integer(position() ne 1) -
+ count(fn:filter(current-group(), function ($w-id) {
+ map:get($map_w, $w-id)[4] eq false()
+ }))"/>
+ <!-\- The general algorithm is:
+ * count and sum the lengths of all the words
+ * add 'whitespace' for all of them (= count them and add that), and then
+ * subtract whitespace for those of them that are not actually preceded by it
+ and if the 1st word is_preceded_by_ws then subtract 1
+ because identifying that 1st word would require an extra step, we're taking a shortcut via position() -
+ and that strongly depends on the presence of the xsl:sort instructions -\->
+
+ <xsl:message select="'sum: ' || sum( for $id in current-group() return string-length(map:get($map_w, $id)[5]) )"/>
+ <!-\-<xsl:message select="for $id in current-group() return (string-length(map:get($map_w, $id)[5]),map:get($map_w, $id)[4] )"/>-\->
+ <xsl:message select="'cur-group count: ' || count(fn:current-group())"/>
+ <!-\-<xsl:message select="fn:for-each(current-group(), function($w-id) { map:get($map_w,$w-id)[4] eq false() } )"></xsl:message>-\->
+ <xsl:message select="'subtract:' || count(fn:filter(current-group(), function($w-id) { map:get($map_w,$w-id)[4] eq false() } ))"></xsl:message>
+ <xsl:message select="'position: ' || position() || ', xs:integer(position() ne 1)=' || xs:integer(position() ne 1)"></xsl:message>
+ <xsl:message select="'p-length: ' || $p-length"/>
+
+ <xsl:map-entry key="current-grouping-key()" select="'p', position(), $p-length"/>
+
+ <xsl:message select="'p: ', $current-p || ' pos:' || position(), current-group()"/>
+
+ <xsl:for-each-group select="current-group()" group-by="map:get($map_w, .)[2]">
+ <xsl:sort select="xs:integer(substring-before(substring-after(., 'segm_'), '.'))"
+ order="ascending"/>
+ <xsl:sort select="xs:integer(substring-before(substring-after(., '.'), '-'))"
+ order="ascending"/>
+ <xsl:variable name="current-s" select="fn:current-grouping-key()" as="xs:string"/> <!-\-xs:untypedAtomic-\->
+ <xsl:variable name="s-length" select="
+ sum(for $id in current-group()
+ return
+ string-length(map:get($map_w, $id)[5])) + count(current-group()) - xs:integer($current-p-pos ne 1) -
+ count(fn:filter(current-group(), function ($w-id) {
+ map:get($map_w, $w-id)[4] eq false()
+ }))"/>
+
+
+ <xsl:map-entry key="current-grouping-key()" select="'s', position(), $s-length, $current-p"/>
+
+ <xsl:message select="'s: ', position(), current-group()"/>
+
+ <xsl:for-each select="current-group()">
+ <xsl:sort select="map:get($map_w, .)[3]" order="ascending"/>
+ <xsl:map-entry key="." select="'w', position(), string-length(map:get($map_w, .)[5]), $current-s, map:get($map_w, .)[4]"/>
+ <!-\- <xsl:message select="map:get($map_w, .)[5]"/>-\->
+ </xsl:for-each>
+ </xsl:for-each-group>
+ </xsl:for-each-group>
+ </xsl:map>
+ </xsl:variable>-->
+
+ <!--<xsl:variable name="offsets-p" as="map(xs:untypedAtomic,item()+)">
+ <xsl:map>
+ <xsl:for-each select="
+ fn:filter(map:keys($map_p-s-w), function ($ele) {
+ map:get($map_p-s-w, $ele)[1] eq 'p'
+ })">
+ <xsl:sort select="map:get($map_p-s-w, .)[2]"/>
+
+ <xsl:map-entry key="." select="map:get($map_p-s-w, .)[2]"/>
+
+
+ </xsl:for-each>
+ </xsl:map>
+ </xsl:variable>-->
+
+
+
+
+ <!-- <xsl:message select="'map_w size: ' || map:size($map_w)"/>
+ <xsl:message select="'map_s-p size: ' || map:size($map_p-s-w)"/>
+ <xsl:message select="'offsets size: ' || map:size($offsets-p)"/>-->
+
<xsl:call-template name="create_data">
<!--<xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>-->
@@ -199,6 +335,7 @@
as="document-node()"/>
<xsl:with-param name="target" select="$targetBaseDir || '/struct/structure.xml'" as="xs:string"
/>
+ <!--<xsl:with-param name="offsets" select="$offsets" as="map(xs:string, xs:integer+)"/>-->
</xsl:call-template>
<!-- <xsl:call-template name="create_morpho">
@@ -211,11 +348,13 @@
<xsl:with-param name="target" select="$targetBaseDir || '/nkjp/morpho.xml'" as="xs:string"/>
</xsl:call-template>
-->
- <xsl:call-template name="create_text_header">
+ <!--<xsl:call-template name="create_text_header">
<xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
<xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
<xsl:with-param name="target" select="$targetBaseDir || '/header.xml'" as="xs:string"/>
- </xsl:call-template>
+ </xsl:call-template>-->
+
+ <!--<xsl:message select="'size of offsets in process_single: ' || map:size($offsets)"/>-->
</xsl:template>
@@ -259,100 +398,7 @@
<xsl:param name="compoundID" as="xs:string"/>
<xsl:param name="ann_segmentation.xml" as="document-node()"/>
<xsl:param name="target" as="xs:string"/>
-
-
-<!-- map the entire document, so that the processing only takes place once, and for fast lookups
-
- MOVE THIS UP TO process_single_sample - to re-use it
-
- -->
- <xsl:variable name="map_w" as="map(xs:untypedAtomic,item()+)">
- <xsl:variable name="segs" select="$ann_segmentation.xml/tei:teiCorpus/tei:TEI/tei:text/tei:body/tei:p/tei:s//tei:seg[count(@nkjp:rejected) eq 0]" as="element()+"/>
- <xsl:map>
- <xsl:for-each-group select="$segs" group-by="ancestor::tei:p[1]/@xml:id">
- <xsl:variable name="current-p" select="current-grouping-key()"/>
- <xsl:for-each-group select="current-group()" group-by="ancestor::tei:s[1]/@xml:id">
- <xsl:variable name="current-s" select="current-grouping-key()"/>
- <xsl:for-each select="current-group()">
- <xsl:map-entry key="@xml:id" select="$current-p, $current-s, position(), f:is_preceded_by_ws(.), normalize-space(tei:w)"/>
- </xsl:for-each>
- </xsl:for-each-group>
- </xsl:for-each-group>
- </xsl:map>
- </xsl:variable>
-
- <xsl:variable name="map_s-p" as="map(xs:untypedAtomic,item()+)">
- <xsl:map>
- <xsl:for-each-group select="map:keys($map_w)" group-by="map:get($map_w, .)[1]">
- <xsl:sort select="xs:integer(substring-before(substring-after(., 'segm_'), '.'))"
- order="ascending"/>
- <xsl:sort select="xs:integer(substring-before(substring-after(., '.'), '-'))"
- order="ascending"/>
- <xsl:variable name="current-p-pos" select="fn:position()" as="xs:integer"/>
- <!-- the above is used in the sentence loop, when we check if it's text-initial -->
- <xsl:variable name="current-p" select="fn:current-grouping-key()" as="xs:string"/> <!--xs:untypedAtomic-->
- <xsl:variable name="p-length" select="
- sum(for $id in current-group()
- return
- string-length(map:get($map_w, $id)[5])) + count(current-group()) - xs:integer(position() ne 1) -
- count(fn:filter(current-group(), function ($w-id) {
- map:get($map_w, $w-id)[4] eq false()
- }))"/>
- <!-- The general algorithm is:
- * count and sum the lengths of all the words
- * add 'whitespace' for all of them (= count them and add that), and then
- * subtract whitespace for those of them that are not actually preceded by it
- and if the 1st word is_preceded_by_ws then subtract 1
- because identifying that 1st word would require an extra step, we're taking a shortcut via position() -
- and that strongly depends on the presence of the xsl:sort instructions -->
-
- <xsl:message select="'sum: ' || sum( for $id in current-group() return string-length(map:get($map_w, $id)[5]) )"/>
- <!--<xsl:message select="for $id in current-group() return (string-length(map:get($map_w, $id)[5]),map:get($map_w, $id)[4] )"/>-->
- <xsl:message select="'cur-group count: ' || count(fn:current-group())"/>
- <!--<xsl:message select="fn:for-each(current-group(), function($w-id) { map:get($map_w,$w-id)[4] eq false() } )"></xsl:message>-->
- <xsl:message select="'subtract:' || count(fn:filter(current-group(), function($w-id) { map:get($map_w,$w-id)[4] eq false() } ))"></xsl:message>
- <xsl:message select="'position: ' || position() || ', xs:integer(position() ne 1)=' || xs:integer(position() ne 1)"></xsl:message>
- <xsl:message select="'p-length: ' || $p-length"/>
-
- <xsl:map-entry key="current-grouping-key()" select="'p', position(), $p-length"/>
-
- <xsl:message select="'p: ', $current-p || ' pos:' || position(), current-group()"/>
-
- <xsl:for-each-group select="current-group()" group-by="map:get($map_w, .)[2]">
- <xsl:sort select="xs:integer(substring-before(substring-after(., 'segm_'), '.'))"
- order="ascending"/>
- <xsl:sort select="xs:integer(substring-before(substring-after(., '.'), '-'))"
- order="ascending"/>
- <xsl:variable name="current-s" select="fn:current-grouping-key()" as="xs:string"/> <!--xs:untypedAtomic-->
- <xsl:variable name="s-length" select="
- sum(for $id in current-group()
- return
- string-length(map:get($map_w, $id)[5])) + count(current-group()) - xs:integer($current-p-pos ne 1) -
- count(fn:filter(current-group(), function ($w-id) {
- map:get($map_w, $w-id)[4] eq false()
- }))"/>
-
-
- <xsl:map-entry key="current-grouping-key()" select="'s', position(), $s-length, $current-p"/>
-
- <xsl:message select="'s: ', position(), current-group()"/>
-
- <xsl:for-each select="current-group()">
- <xsl:sort select="map:get($map_w, .)[3]" order="ascending"/>
- <xsl:map-entry key="." select="'w', position(), string-length(map:get($map_w, .)[5]), $current-s, map:get($map_w, .)[4]"/>
-<!-- <xsl:message select="map:get($map_w, .)[5]"/>-->
- </xsl:for-each>
-
- </xsl:for-each-group>
-
-
-
- </xsl:for-each-group>
- </xsl:map>
- </xsl:variable>
-
- <xsl:message select="'map_w size: ' || map:size($map_w)"/>
- <xsl:message select="'map_s-p size: ' || map:size($map_s-p)"/>
+ <!--<xsl:param name="offsets" as="map(xs:string, xs:integer+)"/>-->
<xsl:result-document encoding="UTF-8" method="xml" indent="yes"
xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">
@@ -360,9 +406,19 @@
<xsl:element name="layer" namespace="{$KorAP_namespace}">
<xsl:attribute name="docid" select="$compoundID"/>
<xsl:attribute name="version" select="$KorAP-XML_version"/>
+
+ <!--<xsl:message select="'size of offsets in create_struct: ' || map:size($offsets)"/>-->
<xsl:element name="spanList" namespace="{$KorAP_namespace}">
- <!--<xsl:apply-templates select="$ann_segmentation.xml//tei:text" mode="struct"/> -->
+ <xsl:apply-templates select="$ann_segmentation.xml//tei:text" mode="struct">
+ <!--<xsl:with-param name="offsets" as="map(xs:string, xs:integer+)" tunnel="yes">
+ <xsl:map>
+ <xsl:for-each select="tail(fn:accumulator-after('elem-offset-seq'))">
+ <xsl:map-entry key="map:get(., 'id')" select="map:get(., 'start'), map:get(., 'end')"/>
+ </xsl:for-each>
+ </xsl:map>
+ </xsl:with-param>-->
+ </xsl:apply-templates>
</xsl:element>
</xsl:element>
</xsl:result-document>
@@ -372,6 +428,17 @@
<xsl:param name="ini" as="xs:integer" required="no" select="0"/>
<xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
<xsl:param name="index" as="xs:integer" required="no" select="1"/>
+ <xsl:param name="offsets" as="map(xs:string, xs:integer+)" tunnel="yes"/>
+
+ <xsl:variable name="offsets" as="map(xs:string, xs:integer+)">
+ <xsl:map>
+ <xsl:for-each select="tail(fn:accumulator-after('elem-offset-seq'))">
+ <xsl:map-entry key="map:get(., 'id')" select="map:get(., 'start'), map:get(., 'end')"/>
+ </xsl:for-each>
+ </xsl:map>
+ </xsl:variable>
+
+<!-- <xsl:message select="'size of offsets in tei:* ' || map:size($offsets)"/>-->
<!-- It's so spread out because I want to make sure to be able to look up the individual
constituent values, should anything go wrong; optimization will come when it's worked against a larger dataset -->
@@ -429,16 +496,22 @@
<xsl:variable name="numbers" select="substring-after(substring-before(@corresp,')'),',')"/>
<xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
</xsl:when>
+ <xsl:otherwise>
+ <xsl:message terminate="yes" select="'Element not handled: ' || fn:local-name()"/>
+ </xsl:otherwise>
</xsl:choose>
</xsl:variable>
<xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)">
</xsl:variable>
+
+
<xsl:element name="span" namespace="{$KorAP_namespace}">
<xsl:attribute name="id" select="'s' || $my_index"/>
<xsl:attribute name="from" select="$start"/>
<xsl:attribute name="to" select="$end"/>
+ <xsl:attribute name="accumulator" select="string-join(map:get($offsets,string(@xml:id)))"/>
<xsl:attribute name="l" select="f:compute_nesting(.)"/>
<xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
<xsl:attribute name="type" select="'struct'"></xsl:attribute> <!-- STRUCT vs. LEX -->
@@ -1061,3 +1134,5 @@
</xsl:stylesheet>
+
+<!--<xsl:message select="('map:',serialize($map, map{'method':'adaptive'}))"/>-->
\ No newline at end of file