Use streaming mode when possible in secondary XSLT passes
This prevents errors with text nodes larger than 2 GB.
Thanks @luengen !
diff --git a/xslt/pass2.xsl b/xslt/pass2.xsl
index 13cd23a..a919fc6 100644
--- a/xslt/pass2.xsl
+++ b/xslt/pass2.xsl
@@ -11,52 +11,59 @@
<xsl:variable name="domainClassifier" select="TextClassifier:new('models/dereko_domains_s.classifier')"/>
- <xsl:mode on-no-match="shallow-copy"/>
+ <xsl:mode streamable="yes" on-no-match="shallow-copy"/>
+ <xsl:mode streamable="no" on-no-match="shallow-copy" name="no-Streaming"/>
- <xsl:template match="textClass">
- <xsl:variable name="classification" select="tokenize(TextClassifier:topicDomainsFromText($domainClassifier, ../../../text), ';')"/>
+ <xsl:template match="idsText">
+ <xsl:variable name="idsText" as="node()">
+ <xsl:copy-of select="."/>
+ </xsl:variable>
+ <xsl:apply-templates select="$idsText" mode="no-Streaming"/>
+ </xsl:template>
+
+ <xsl:template match="textClass" mode="no-Streaming">
<textClass>
- <catRef n="{$classification[1]}" target="{$classification[2]}" scheme="topic"/>
- <xsl:if test="xs:decimal($classification[3]) > 0.0000001">
- <catRef n="{$classification[3]}" target="{$classification[4]}" scheme="topic"/>
- </xsl:if>
+ <xsl:text>
</xsl:text>
+ <xsl:value-of disable-output-escaping="yes"
+ select="TextClassifier:insertCatRefs($domainClassifier, 'topic', ../../../text, 0.0001)"/>
+ <xsl:apply-templates mode="no-Streaming"/>
</textClass>
</xsl:template>
- <xsl:template match="p[not(normalize-space())]" priority="1.0"/>
+ <xsl:template match="p[not(normalize-space())]" priority="1.0" mode="no-Streaming"/>
- <xsl:template match="div[not(normalize-space())]" priority="1.0"/>
+ <xsl:template match="div[not(normalize-space())]" priority="1.0" mode="no-Streaming"/>
- <xsl:template match="p[descendant::div|descendant::p and not(ancestor::item)]" priority="0.9">
+ <xsl:template match="p[descendant::div|descendant::p and not(ancestor::item)]" priority="0.9" mode="no-Streaming">
<div type="section">
- <xsl:apply-templates/>
+ <xsl:apply-templates mode="no-Streaming"/>
</div>
</xsl:template>
- <xsl:template match="(ref|emph|hi|text())[parent::div]" priority="0.9">
+ <xsl:template match="(ref|emph|hi|text())[parent::div]" priority="0.9" mode="no-Streaming">
<p>
- <xsl:apply-templates/>
+ <xsl:apply-templates mode="no-Streaming"/>
</p>
</xsl:template>
- <xsl:template match="head[parent::p]">
+ <xsl:template match="head[parent::p]" mode="no-Streaming">
<hi rend="bold">
<xsl:value-of select="."/>
</hi>
</xsl:template>
- <xsl:template match="hi[parent::div]">
+ <xsl:template match="hi[parent::div]" mode="no-Streaming">
<p>
- <xsl:apply-templates/>
+ <xsl:apply-templates mode="no-Streaming"/>
</p>
</xsl:template>
- <xsl:template match="div[ancestor::item]">
+ <xsl:template match="div[ancestor::item]" mode="no-Streaming">
<p>
- <xsl:apply-templates/>
+ <xsl:apply-templates mode="no-Streaming"/>
</p>
</xsl:template>
- <xsl:template match="p[normalize-space(.) = ' ']"/>
+ <xsl:template match="p[normalize-space(.) = ' ']" mode="no-Streaming"/>
</xsl:stylesheet>
diff --git a/xslt/pass3.xsl b/xslt/pass3.xsl
index f54bdca..4a0e002 100644
--- a/xslt/pass3.xsl
+++ b/xslt/pass3.xsl
@@ -7,34 +7,44 @@
doctype-system="http://corpora.ids-mannheim.de/I5/DTD/i5.dtd"
/>
- <xsl:mode on-no-match="shallow-copy"/>
+ <xsl:mode streamable="yes" on-no-match="shallow-copy"/>
+ <xsl:mode streamable="no" on-no-match="shallow-copy" name="no-Streaming"/>
+ <!-- <xsl:template match="idsDoc[not(normalize-space(.//body))]"/> -->
- <xsl:template match="idsDoc[not(normalize-space(.//body))]"/>
-
- <xsl:template match="idsText[not(normalize-space(.//body))]"/>
- <xsl:template match="hi[parent::div]" priority="0.6">
+ <!-- <xsl:template match="idsText[not(normalize-space(.//body))]"/>-->
+
+ <xsl:template match="idsText">
+ <xsl:variable name="idsText" as="node()">
+ <xsl:copy-of select="."/>
+ </xsl:variable>
+ <xsl:if test="normalize-space($idsText//body)">
+ <xsl:apply-templates select="$idsText" mode="no-Streaming"/>
+ </xsl:if>
+ </xsl:template>
+
+ <xsl:template match="hi[parent::div]" priority="0.6" mode="no-Streaming">
<p>
- <xsl:apply-templates/>
+ <xsl:apply-templates mode="no-Streaming"/>
</p>
</xsl:template>
- <xsl:template match="(ref|emph|hi|text())[parent::div]" priority="0.9">
+ <xsl:template match="(ref|emph|hi|text())[parent::div]" priority="0.9" mode="no-Streaming">
<p>
- <xsl:apply-templates/>
+ <xsl:apply-templates mode="no-Streaming"/>
</p>
</xsl:template>
- <xsl:template match="p/div">
- <xsl:apply-templates/>
+ <xsl:template match="p/div" mode="no-Streaming">
+ <xsl:apply-templates mode="no-Streaming"/>
</xsl:template>
- <xsl:template match="p/p">
- <xsl:apply-templates/>
+ <xsl:template match="p/p" mode="no-Streaming">
+ <xsl:apply-templates mode="no-Streaming"/>
</xsl:template>
- <xsl:template match="hi[local-name(preceding-sibling::*[1]) = 'hi' and @rend = preceding-sibling::hi[1]/@rend]"/>
+ <xsl:template match="hi[local-name(preceding-sibling::*[1]) = 'hi' and @rend = preceding-sibling::hi[1]/@rend]" mode="no-Streaming"/>
- <xsl:template match="hi[following-sibling::hi and not(preceding-sibling::hi)]">
+ <xsl:template match="hi[following-sibling::hi and not(preceding-sibling::hi)]" mode="no-Streaming">
<xsl:variable name="rend" select="@rend"/>
<hi rend="{$rend}">
<xsl:for-each-group select="self|following-sibling::hi" group-adjacent="@rend=$rend">