<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"
xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:f="func"
xmlns:fn="http://www.w3.org/2005/xpath-functions"
xmlns:xi="http://www.w3.org/2001/XInclude"
xmlns:map="http://www.w3.org/2005/xpath-functions/map" exclude-result-prefixes="xs f fn map nkjp tei xi"
version="3.0" expand-text="yes">
<!-- PARAMETERS -->
<!-- sourceDir: the directory holding the NKJP files, organised as a collection of text-level
     directories, each of which contains a text.xml and its ann_*.xml annotation files -->
<xsl:param name="sourceDir" as="xs:string" select="'test/resources/nkjp2korap_sample2'"/>
<!-- targetDir: where the corpus/document/text/annotations hierarchy is going to be created -->
<xsl:param name="targetDir" as="xs:string" select="'test/output'"/>
<!-- skip_docID: comma-separated list of IDs to be skipped from processing
     example: HellerPodgladanie,KOT
     No functionality beyond string identity is supported (this is just for testing).
     Pass the empty string in order to process everything. -->
<xsl:param name="skip_docID" as="xs:string"
  select="'HellerPodgladanie,IsakowiczZaleskiMoje,KolakowskiOco,MysliwskiKamien,WilkWilczy,ZycieWarszawy_Zycie'"/>
<!-- SHOW_ORTH_IN_STRUCT: for debugging structure.xml production -->
<xsl:param name="SHOW_ORTH_IN_STRUCT" select="true()" as="xs:boolean"/>
<!-- VARIABLES (= constants...) -->
<!-- identifiers used for the corpus level and the document level of the output hierarchy -->
<xsl:variable name="corpusID" static="yes" as="xs:string" select="'NKJP'"/>
<xsl:variable name="docID" static="yes" as="xs:string" select="'NKJP'"/>
<!-- the corpus-level output directory, with a trailing slash -->
<xsl:variable name="targetCorpusDir_slashed" as="xs:string"
  select="$targetDir || '/' || $corpusID || '/'"/>
<!-- system and public identifiers of the I5 DTD -->
<xsl:variable name="systemDoctypeI5" static="yes" as="xs:string"
  select="'http://corpora.ids-mannheim.de/I5/DTD/i5.dtd'"/>
<xsl:variable name="publicDoctypeI5" static="yes" as="xs:string"
  select="'-//IDS//DTD I5 1.0//EN'"/>
<!-- namespace of the KorAP-XML elements created by this stylesheet -->
<xsl:variable name="KorAP_namespace" static="yes" as="xs:string"
  select="'http://ids-mannheim.de/ns/KorAP'"/>
<!-- value of the version attribute written on each layer element -->
<xsl:variable name="KorAP-XML_version" static="yes" as="xs:string" select="'KorAP-0.4'"/>
<!-- this is only a bit funny -->
<!-- query parameters appended to the collection URI;
     see https://www.saxonica.com/documentation11/index.html#!sourcedocs/collections/collection-directories -->
<xsl:variable name="collection_params" static="yes" as="xs:string"
  select="'recurse=yes;validation=strip;select=text.xml;content-type=application/xml;on-error=warning;xinclude=yes'"/>
<!-- every text.xml found under $sourceDir; at least one is required -->
<xsl:variable name="collection_of_text" as="document-node()+"
  select="collection($sourceDir || '?' || $collection_params)"/>
<!-- MODES -->
<!-- named modes, each with its own treatment of nodes that no template rule matches -->
<xsl:mode on-no-match="deep-skip" name="corpus"/>
<xsl:mode on-no-match="deep-skip" name="text"/>
<xsl:mode on-no-match="text-only-copy" name="header-text"/>
<xsl:mode on-no-match="shallow-copy" name="copy"/>
<!-- the unnamed mode declares every accumulator as applicable -->
<xsl:mode use-accumulators="#all"/>
<!-- element-index: running count of the TEI elements at or below tei:text,
     incremented on entering each such element -->
<xsl:accumulator name="element-index" initial-value="0" as="xs:integer">
  <xsl:accumulator-rule phase="start" match="tei:*[ancestor-or-self::tei:text]">
    <xsl:sequence select="$value + 1"/>
  </xsl:accumulator-rule>
</xsl:accumulator>
<!--I think I may be able to actually merge the two accumulators, but let's see-->
<!-- morpho-offsets: character offsets computed over segments that carry a 'morph' feature
     structure with a 'disamb' feature.
     The value is a growing sequence of single-entry maps, each going from an @xml:id to a pair
     of integers. Every rule reads the second integer of the most recently added map as the
     position reached so far. Paragraphs and sentences get a provisional entry on entry (second
     integer: where their content starts) and a final (from, to) entry on exit. -->
<xsl:accumulator name="morpho-offsets" initial-value="(map{'null':(0,0)})" as="map(xs:string, item()+)+">
  <!-- entering a paragraph -->
  <xsl:accumulator-rule phase="start" match="tei:body/tei:p">
    <xsl:variable name="reached" as="xs:integer"
      select="let $latest := $value[last()] return map:get($latest, map:keys($latest)[1])[2]"/>
    <!-- for paragraphs, it's in either being initial or not -->
    <xsl:variable name="starts_at" as="xs:integer"
      select="if ($reached eq 0) then $reached else $reached + 1"/>
    <xsl:sequence select="$value, map { string(@xml:id): ($reached, $starts_at) }"/>
  </xsl:accumulator-rule>
  <!-- entering a sentence -->
  <xsl:accumulator-rule phase="start" match="tei:s">
    <xsl:variable name="reached" as="xs:integer"
      select="let $latest := $value[last()] return map:get($latest, map:keys($latest)[1])[2]"/>
    <xsl:variable name="starts_at" as="xs:integer"
      select="if ($reached eq 0) then $reached else $reached + xs:integer(f:is_preceded_by_ws(., true()))"/>
    <xsl:sequence select="$value, map { string(@xml:id): ($reached, $starts_at) }"/>
  </xsl:accumulator-rule>
  <!-- leaving a segment.
       I want something that won't be matched in other layers, for efficiency - that
       may allow me to merge the accumulators, eventually;
       but I also want to filter out the rejected tokenization alternatives already here -->
  <xsl:accumulator-rule phase="end" match="tei:seg[tei:fs[@type eq 'morph' and tei:f[@name eq 'disamb']]]">
    <xsl:variable name="reached" as="xs:integer"
      select="let $latest := $value[last()] return map:get($latest, map:keys($latest)[1])[2]"/>
    <xsl:variable name="starts_at" as="xs:integer"
      select="$reached + xs:integer(f:is_preceded_by_ws(., true()))"/>
    <!-- the segment ends after as many characters as its 'orth' string has -->
    <xsl:sequence select="
      $value,
      map { string(@xml:id): ($starts_at, $starts_at + string-length(tei:fs/tei:f[@name eq 'orth']/tei:string)) }"/>
  </xsl:accumulator-rule>
  <!-- leaving a sentence: the start comes from the provisional entry, the end is the position reached -->
  <xsl:accumulator-rule phase="end" match="tei:s">
    <xsl:variable name="reached" as="xs:integer"
      select="let $latest := $value[last()] return map:get($latest, map:keys($latest)[1])[2]"/>
    <xsl:variable name="starts_at" as="xs:integer"
      select="map:find($value, string(@xml:id))(1)[2]"/>
    <xsl:sequence select="$value, map { string(@xml:id): ($starts_at, $reached) }"/>
  </xsl:accumulator-rule>
  <!-- leaving a paragraph: same procedure as for sentences -->
  <xsl:accumulator-rule phase="end" match="tei:body/tei:p">
    <xsl:variable name="reached" as="xs:integer"
      select="let $latest := $value[last()] return map:get($latest, map:keys($latest)[1])[2]"/>
    <xsl:variable name="starts_at" as="xs:integer"
      select="map:find($value, string(@xml:id))(1)[2]"/>
    <xsl:sequence select="$value, map { string(@xml:id): ($starts_at, $reached) }"/>
  </xsl:accumulator-rule>
</xsl:accumulator>
<!-- segmentation-offsets: character offsets computed over the tei:w children of segments that
     are not marked as rejected.
     The value is a growing sequence of single-entry maps, each going from an @xml:id to a pair
     of integers. Every rule reads the second integer of the most recently added map as the
     position reached so far. Paragraphs and sentences get a provisional entry on entry (second
     integer: where their content starts) and a final (from, to) entry on exit; body and text
     get a single (0, position reached) entry on exit. -->
<xsl:accumulator name="segmentation-offsets" initial-value="(map{'null':(0,0)})" as="map(xs:string, item()+)+">
  <!-- entering a paragraph -->
  <xsl:accumulator-rule phase="start" match="tei:body/tei:p">
    <xsl:variable name="reached" as="xs:integer"
      select="let $latest := $value[last()] return map:get($latest, map:keys($latest)[1])[2]"/>
    <!-- for paragraphs, it's in either being initial or not -->
    <xsl:variable name="starts_at" as="xs:integer"
      select="if ($reached eq 0) then $reached else $reached + 1"/>
    <xsl:sequence select="$value, map { string(@xml:id): ($reached, $starts_at) }"/>
  </xsl:accumulator-rule>
  <!-- entering a sentence -->
  <xsl:accumulator-rule phase="start" match="tei:s">
    <xsl:variable name="reached" as="xs:integer"
      select="let $latest := $value[last()] return map:get($latest, map:keys($latest)[1])[2]"/>
    <xsl:variable name="starts_at" as="xs:integer"
      select="if ($reached eq 0) then $reached else $reached + xs:integer(f:is_preceded_by_ws(., true()))"/>
    <xsl:sequence select="$value, map { string(@xml:id): ($reached, $starts_at) }"/>
  </xsl:accumulator-rule>
  <!-- leaving the tei:w of an accepted segment: the entry is keyed by the ID of the parent segment -->
  <xsl:accumulator-rule phase="end" match="tei:w[parent::tei:seg[count(@nkjp:rejected) eq 0]]">
    <xsl:variable name="reached" as="xs:integer"
      select="let $latest := $value[last()] return map:get($latest, map:keys($latest)[1])[2]"/>
    <xsl:variable name="starts_at" as="xs:integer"
      select="$reached + xs:integer(f:is_preceded_by_ws(parent::tei:seg, true()))"/>
    <xsl:sequence select="
      $value,
      map { string(parent::tei:seg/@xml:id): ($starts_at, $starts_at + string-length()) }"/>
  </xsl:accumulator-rule>
  <!-- leaving a sentence: the start comes from the provisional entry, the end is the position reached -->
  <xsl:accumulator-rule phase="end" match="tei:s">
    <xsl:variable name="reached" as="xs:integer"
      select="let $latest := $value[last()] return map:get($latest, map:keys($latest)[1])[2]"/>
    <xsl:variable name="starts_at" as="xs:integer"
      select="map:find($value, string(@xml:id))(1)[2]"/>
    <xsl:sequence select="$value, map { string(@xml:id): ($starts_at, $reached) }"/>
  </xsl:accumulator-rule>
  <!-- leaving a paragraph: same procedure as for sentences -->
  <xsl:accumulator-rule phase="end" match="tei:body/tei:p">
    <xsl:variable name="reached" as="xs:integer"
      select="let $latest := $value[last()] return map:get($latest, map:keys($latest)[1])[2]"/>
    <xsl:variable name="starts_at" as="xs:integer"
      select="map:find($value, string(@xml:id))(1)[2]"/>
    <xsl:sequence select="$value, map { string(@xml:id): ($starts_at, $reached) }"/>
  </xsl:accumulator-rule>
  <!-- leaving the body: it spans from 0 to the position reached -->
  <xsl:accumulator-rule phase="end" match="tei:body">
    <xsl:variable name="reached" as="xs:integer"
      select="let $latest := $value[last()] return map:get($latest, map:keys($latest)[1])[2]"/>
    <xsl:sequence select="$value, map { string(@xml:id): (0, $reached) }"/>
  </xsl:accumulator-rule>
  <!-- leaving the text: it spans from 0 to the position reached -->
  <xsl:accumulator-rule phase="end" match="tei:text">
    <xsl:variable name="reached" as="xs:integer"
      select="let $latest := $value[last()] return map:get($latest, map:keys($latest)[1])[2]"/>
    <xsl:sequence select="$value, map { string(@xml:id): (0, $reached) }"/>
  </xsl:accumulator-rule>
</xsl:accumulator>
<!-- FUNCTIONS -->
<!-- f:compute_nesting: the number of elements on the ancestor-or-self axis of $node,
     leaving out those whose local name is TEI or teiCorpus -->
<xsl:function name="f:compute_nesting" as="xs:integer">
  <xsl:param name="node" as="element()"/>
  <xsl:sequence
    select="count($node/ancestor-or-self::*[not(local-name() = ('TEI', 'teiCorpus'))])"/>
</xsl:function>
<!-- f:is_preceded_by_ws: tells whether a whitespace character is to be assumed in front of $node.
     $node              a seg, s or p element; anything else terminates the transformation
     $suppress_initial  when true, the first segment of a sentence and the first sentence of a
                        non-initial paragraph are reported as not preceded by whitespace -->
<xsl:function name="f:is_preceded_by_ws" as="xs:boolean">
  <xsl:param name="node" as="element()"/>
  <xsl:param name="suppress_initial" as="xs:boolean"/>
  <xsl:variable name="kind" as="xs:string" select="local-name($node)"/>
  <xsl:choose>
    <xsl:when test="$kind eq 'seg'">
      <!-- The conditions are tried in this order; the first one that holds yields false():
           1. the segment carries @nkjp:nps
           2. the segment has an 'nps' feature (added for traversing ann_morphosyntax)
           3. the segment is the very first accepted one of the text; the otherwise very costly
              check for preceding segs comes last, so that it has minimal search space
           4. $suppress_initial, and the segment is the first accepted one of its sentence
              (the IDs are compared rather than the nodes)
           5. $suppress_initial, and the segment is the first one of its sentence that has a
              'disamb' feature (added for traversing ann_morphosyntax) -->
      <xsl:sequence select="
        if ($node/@nkjp:nps) then false()
        else if ($node/tei:fs/tei:f[@name eq 'nps']) then false()
        else if ($node/ancestor::tei:s[count(preceding-sibling::tei:s) eq 0]
                 and $node/ancestor::tei:p[count(preceding-sibling::tei:p) eq 0]
                 and not($node/preceding::tei:seg[count(@nkjp:rejected) eq 0])) then false()
        else if ($suppress_initial
                 and $node/ancestor::tei:s/descendant::tei:seg[count(@nkjp:rejected) eq 0][1]/@xml:id eq $node/@xml:id) then false()
        else if ($suppress_initial
                 and $node/ancestor::tei:s/descendant::tei:seg[tei:fs/tei:f[@name eq 'disamb']][1]/@xml:id eq $node/@xml:id) then false()
        else true()"/>
    </xsl:when>
    <xsl:when test="$kind eq 's'">
      <!-- a non-initial sentence is always preceded by whitespace; an initial one only if its
           paragraph is not the first one and the initial position is not being suppressed -->
      <xsl:sequence select="
        if (exists($node/preceding-sibling::tei:s)) then true()
        else (not($suppress_initial) and exists($node/ancestor::tei:p[1]/preceding-sibling::tei:p))"/>
    </xsl:when>
    <xsl:when test="$kind eq 'p'">
      <xsl:sequence select="exists($node/preceding-sibling::tei:p)"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:message terminate="yes"
        select="'Wrong argument passed to f:is_preceded_by_ws(): ' || local-name($node) || ' Only p, s, seg are allowed.'"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>
<!-- UTILITY TEMPLATES -->
<!-- this is to delete some auto-inserted attribute throughout -->
<xsl:template mode="#all" match="@default"/>
<!-- NKJP-SGJP has apparently resigned from standoff representations by adding <w> everywhere;
     we reach for them, but from the level of <seg>, so we don't need to process <w> separately -->
<xsl:template mode="#all" match="tei:w"/>
<!-- fall-thru for both the struct and the morpho mode: skip the potential <paren> element,
     filter out the rejected segments, and carry on in whichever mode we are in -->
<xsl:template match="tei:choice" mode="struct morpho">
  <xsl:apply-templates select="descendant::tei:seg[not(@nkjp:rejected)]" mode="#current"/>
</xsl:template>
<!-- MAIN PROCESSING -->
<!-- Entry point: writes the corpus-level header once, then converts every text.xml of the
     collection together with the annotation files that sit next to it, unless the name of
     its directory is listed in $skip_docID. -->
<xsl:template name="xsl:initial-template">
  <xsl:variable name="IDs_to_skip" as="xs:string*" select="tokenize($skip_docID,',')"/>
  <!-- we only want to call the template below once, and we process a random NKJP corpus file for that purpose,
       because all we need is the main corpus header, and we can (should) get to that from any NKJP corpus document -->
  <xsl:call-template name="create_corpus_header">
    <xsl:with-param name="text.xml" as="document-node()" select="$collection_of_text[1]"/>
    <xsl:with-param name="target" as="xs:string" select="$targetCorpusDir_slashed || 'header.xml'"/>
  </xsl:call-template>
  <xsl:for-each select="$collection_of_text">
    <!-- the directory of the current text.xml; its last path step serves as the text ID -->
    <xsl:variable name="sample_dir" as="xs:string" select="replace(base-uri(),'/text\.xml','')"/>
    <xsl:variable name="text_id" as="xs:string" select="tokenize($sample_dir,'/')[last()]"/>
    <xsl:variable name="morphosyntax_uri" as="xs:string" select="$sample_dir || '/ann_morphosyntax.xml'"/>
    <xsl:variable name="segmentation_uri" as="xs:string" select="$sample_dir || '/ann_segmentation.xml'"/>
    <xsl:variable name="named_uri" as="xs:string" select="$sample_dir || '/ann_named.xml'"/>
    <xsl:variable name="groups_uri" as="xs:string" select="$sample_dir || '/ann_groups.xml'"/>
    <xsl:variable name="words_uri" as="xs:string" select="$sample_dir || '/ann_words.xml'"/>
    <!-- skipping is a utility step, for when we want to ignore some texts for any reason
         (debugging, selective update) -->
    <xsl:if test="not($text_id = $IDs_to_skip)">
      <xsl:call-template name="process_single_sample">
        <xsl:with-param name="text.xml" select="." as="document-node()"/>
        <xsl:with-param name="ann_morphosyntax.xml" select="doc($morphosyntax_uri)"
          as="document-node()"/>
        <xsl:with-param name="ann_segmentation.xml" select="doc($segmentation_uri)"
          as="document-node()"/>
        <xsl:with-param name="my_textID" as="xs:string" select="$text_id"/>
        <!-- the following parameters may happen to be null -->
        <xsl:with-param name="ann_named.xml" as="document-node()*"
          select="if (doc-available($named_uri)) then doc($named_uri) else ()"/>
        <xsl:with-param name="ann_groups.xml" as="document-node()*"
          select="if (doc-available($groups_uri)) then doc($groups_uri) else ()"/>
        <xsl:with-param name="ann_words.xml" as="document-node()*"
          select="if (doc-available($words_uri)) then doc($words_uri) else ()"/>
      </xsl:call-template>
    </xsl:if>
  </xsl:for-each>
</xsl:template>
<!-- process_single_sample: creates all output files for one NKJP text.
     text.xml, ann_morphosyntax.xml, ann_segmentation.xml   the obligatory source documents
     my_textID                                              ID of the text, used in output paths and in @docid
     ann_named.xml, ann_groups.xml, ann_words.xml           optional source documents (may be empty)
     Always writes data.xml, struct/structure.xml, nkjp/morpho.xml and header.xml;
     writes nkjp/named.xml if ann_named.xml is present, and nkjp/groups.xml if both
     ann_words.xml and ann_groups.xml are present. -->
<xsl:template name="process_single_sample">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="my_textID" as="xs:string" select="'0-BAD_textID'"/>
  <!-- empty textID should never happen, but if it does, it will be signalled at the top of the output -->
  <xsl:param name="ann_named.xml" as="document-node()*"/>
  <xsl:param name="ann_groups.xml" as="document-node()*"/>
  <xsl:param name="ann_words.xml" as="document-node()*"/>
  <xsl:variable name="targetBaseDir" as="xs:string" select="$targetCorpusDir_slashed || $docID || '/' || $my_textID"/>
  <xsl:variable name="compoundID" as="xs:string"
    select="$corpusID || '_' || $docID || '.' || $my_textID"/>
  <!-- this is what occurs in the text and data layers as @docid -->
  <xsl:call-template name="create_data">
    <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml" as="document-node()"/>
    <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
    <xsl:with-param name="target" select="$targetBaseDir || '/data.xml'" as="xs:string"/>
  </xsl:call-template>
  <xsl:call-template name="create_struct">
    <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
    <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml"
      as="document-node()"/>
    <xsl:with-param name="target" select="$targetBaseDir || '/struct/structure.xml'" as="xs:string"/>
  </xsl:call-template>
  <xsl:call-template name="create_morpho">
    <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
    <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml"
      as="document-node()"/>
    <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml"
      as="document-node()"/>
    <xsl:with-param name="target" select="$targetBaseDir || '/nkjp/morpho.xml'" as="xs:string"/>
  </xsl:call-template>
  <xsl:call-template name="create_text_header">
    <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
    <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
    <xsl:with-param name="target" select="$targetBaseDir || '/header.xml'" as="xs:string"/>
  </xsl:call-template>
  <xsl:if test="$ann_named.xml">
    <!-- one single-entry map per usable pointer: ID of the morphosyntactic segment pointed at
         => the pointer element itself.
         The sequence is allowed to be empty: an ann_named.xml without any pointer into
         ann_morphosyntax.xml must yield an empty lookup table rather than a type error. -->
    <xsl:variable name="rev_lookup-seq" as="map(*)*">
      <xsl:for-each select="$ann_named.xml//tei:seg/tei:ptr">
        <xsl:variable name="trg" as="xs:string" select="fn:substring-before(@target, '#')"/>
        <!-- caution: as of 01-June-2022, some of the pointers are malformed (missing '#' when referencing locally).
             so we need to act around it but also sustainably - in case that error gets corrected.
             A target without '#' yields an empty $trg and is therefore left out as well. -->
        <xsl:if test="$trg eq 'ann_morphosyntax.xml'">
          <xsl:map-entry key="fn:substring-after(fn:string(@target), '#')" select="."/>
        </xsl:if>
      </xsl:for-each>
    </xsl:variable>
    <!-- several pointers at the same segment are kept together under one key -->
    <xsl:variable name="rev_lookup" as="map(*)" select="map:merge($rev_lookup-seq,map{'duplicates':'combine'})"/>
    <xsl:call-template name="create_named">
      <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
      <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml"
        as="document-node()"/>
      <xsl:with-param name="ann_named.xml" select="$ann_named.xml"
        as="document-node()"/>
      <xsl:with-param name="target" select="$targetBaseDir || '/nkjp/named.xml'" as="xs:string"/>
      <xsl:with-param name="rev_lookup" select="$rev_lookup" as="map(*)"/>
    </xsl:call-template>
  </xsl:if>
  <xsl:if test="$ann_words.xml and $ann_groups.xml">
    <xsl:call-template name="create_groups">
      <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
      <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml"
        as="document-node()"/>
      <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml"
        as="document-node()"/>
      <xsl:with-param name="ann_words.xml" select="$ann_words.xml"
        as="document-node()"/>
      <xsl:with-param name="ann_groups.xml" select="$ann_groups.xml"
        as="document-node()"/>
      <xsl:with-param name="target" select="$targetBaseDir || '/nkjp/groups.xml'" as="xs:string"/>
    </xsl:call-template>
  </xsl:if>
</xsl:template>
<!-- ************************** data.xml ******************* -->
<!-- create_data: writes the raw text of one NKJP text to $target (data.xml).
     ann_segmentation.xml  the segmentation layer, from which the text is assembled
     compoundID            written as @docid of the raw_text element
     target                URI of the file to be created -->
<xsl:template name="create_data">
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="target" as="xs:string"/>
  <!-- create the data.xml file.
       xpath-default-namespace is not an attribute value template, so the namespace is spelled
       out literally here, just as in the other create_* templates -->
  <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">
    <xsl:processing-instruction name="xml-model">href=&quot;text.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
    <xsl:element name="raw_text" namespace="{$KorAP_namespace}">
      <xsl:attribute name="docid" select="$compoundID"/>
      <xsl:element name="metadata" namespace="{$KorAP_namespace}">
        <xsl:attribute name="file" select="'metadata.xml'"/>
      </xsl:element>
      <xsl:element name="text" namespace="{$KorAP_namespace}">
        <!-- for every accepted segment: a space if one is to precede it (or nothing), then its word form -->
        <xsl:variable name="content" as="xs:string+">
          <xsl:for-each select="$ann_segmentation.xml/tei:teiCorpus/tei:TEI/tei:text/tei:body/tei:p/tei:s//tei:seg[count(@nkjp:rejected) eq 0]">
            <xsl:sequence select="
              (if (f:is_preceded_by_ws(., false())) then ' ' else ''),
              ./tei:w"/>
          </xsl:for-each>
        </xsl:variable>
        <xsl:value-of select="string-join($content)"/>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
<!-- ************************** struct ******************* -->
<!-- create_struct: writes the structural layer of one NKJP text to $target.
     compoundID            written as @docid of the layer element
     ann_segmentation.xml  the document whose tei:text elements are processed in mode 'struct'
     target                URI of the file to be created -->
<xsl:template name="create_struct">
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>
  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <xsl:processing-instruction name="xml-model"
      select="'href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;'"/>
    <xsl:element namespace="{$KorAP_namespace}" name="layer">
      <xsl:attribute name="docid" select="$compoundID"/>
      <xsl:attribute name="version" select="$KorAP-XML_version"/>
      <!-- the spans themselves come from the templates of the 'struct' mode -->
      <xsl:element namespace="{$KorAP_namespace}" name="spanList">
        <xsl:apply-templates mode="struct" select="$ann_segmentation.xml//tei:text"/>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
<!-- Mode 'struct', any TEI element: emits one span describing the element (its offsets, its
     nesting level, its name and its attributes), and then processes the children, so that
     the spans end up as a flat list. -->
<xsl:template match="tei:*" mode="struct">
  <!-- the (from, to) pair recorded for this element by the segmentation-offsets accumulator -->
  <xsl:variable name="span_bounds" as="xs:integer+"
    select="map:get(accumulator-after('segmentation-offsets')[last()], string(@xml:id))"/>
  <xsl:variable name="span_number" as="xs:integer" select="accumulator-before('element-index')"/>
  <xsl:element namespace="{$KorAP_namespace}" name="span">
    <xsl:attribute name="id" select="'s' || $span_number"/>
    <xsl:attribute name="from" select="$span_bounds[1]"/>
    <xsl:attribute name="to" select="$span_bounds[2]"/>
    <xsl:attribute name="l" select="f:compute_nesting(.)"/>
    <!-- debugging aid: show the text of a segment in a comment -->
    <xsl:if test="$SHOW_ORTH_IN_STRUCT and self::tei:seg">
      <xsl:comment select="normalize-space(.)"/>
    </xsl:if>
    <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="fs">
      <!-- STRUCT vs. LEX for morpho -->
      <xsl:attribute name="type" select="'struct'"/>
      <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="f">
        <xsl:attribute name="name" select="'name'"/>
        <xsl:value-of select="local-name()"/>
      </xsl:element>
      <!-- the attributes of the element, if it has any, as a nested feature structure -->
      <xsl:if test="exists(@*)">
        <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="f">
          <xsl:attribute name="name" select="'attr'"/>
          <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="fs">
            <xsl:attribute name="type" select="'attr'"/>
            <xsl:for-each select="@*">
              <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="f">
                <xsl:attribute name="name" select="local-name(.)"/>
                <xsl:value-of select="."/>
              </xsl:element>
            </xsl:for-each>
          </xsl:element>
        </xsl:element>
      </xsl:if>
    </xsl:element>
  </xsl:element>
  <xsl:apply-templates mode="struct"/>
</xsl:template>
<!-- ************************** morpho ******************* -->
<!-- create_morpho: writes the morphosyntactic layer of one NKJP text to $target.
     compoundID            written as @docid of the layer element
     ann_segmentation.xml  the document whose tei:text elements are processed in mode 'morpho'
     ann_morphosyntax.xml  handed on to the 'morpho' templates as a tunnel parameter
     target                URI of the file to be created -->
<xsl:template name="create_morpho">
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>
  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <xsl:processing-instruction name="xml-model"
      select="'href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;'"/>
    <xsl:element namespace="{$KorAP_namespace}" name="layer">
      <xsl:attribute name="docid" select="$compoundID"/>
      <xsl:attribute name="version" select="$KorAP-XML_version"/>
      <xsl:element namespace="{$KorAP_namespace}" name="spanList">
        <xsl:apply-templates mode="morpho" select="$ann_segmentation.xml//tei:text">
          <xsl:with-param name="ann_morphosyntax.xml" tunnel="yes" as="document-node()"
            select="$ann_morphosyntax.xml"/>
        </xsl:apply-templates>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
<!-- Index of segments by the part of their @corresp that follows the '#'. It replaces a scan
     of the entire ann_morphosyntax.xml for every single segment of ann_segmentation.xml. -->
<xsl:key name="morph-seg-by-corresp" match="tei:seg" use="substring-after(@corresp, '#')"/>
<!-- Mode 'morpho', a segment of ann_segmentation.xml: emits one span with the lemma, the part
     of speech and (if present) the morphosyntactic description chosen in the 'disamb' feature
     of the corresponding segment of ann_morphosyntax.xml.
     ann_morphosyntax.xml (tunnel)  the document in which that corresponding segment is looked up -->
<xsl:template match="tei:seg" mode="morpho">
  <xsl:param name="ann_morphosyntax.xml" as="document-node()" tunnel="yes"/>
  <!-- it's so spread out because I wanted to make sure to be able to look up the individual
       constituent values, should anything go wrong; it might get compacted at some point, but
       the increase in efficiency will probably be minimal, compared to the decrease of readability -->
  <xsl:variable name="offsets" as="xs:integer+">
    <xsl:sequence select="map:get(fn:accumulator-after('segmentation-offsets')[last()], string(@xml:id))"/>
  </xsl:variable>
  <xsl:variable name="my_id" select="@xml:id" as="xs:string"/>
  <!-- exactly one morphosyntactic segment is expected to point at this one -->
  <xsl:variable name="my_morph-seg" as="node()" select="key('morph-seg-by-corresp', $my_id, $ann_morphosyntax.xml)"/>
  <xsl:variable name="my_disamb" select="$my_morph-seg//tei:fs/tei:f[@name eq 'disamb']" as="node()"/>
  <!-- the 'choice' feature points at the symbol that stands for the chosen interpretation -->
  <xsl:variable name="my_choice-id" select="substring-after($my_disamb//tei:f[@name eq 'choice']/@fVal,'#')" as="xs:string"/>
  <xsl:variable name="my_choice-lex" select="$my_morph-seg//tei:f[@name eq 'interps']/tei:fs[@type eq 'lex'][descendant::tei:symbol[@xml:id eq $my_choice-id]]" as="node()"/>
  <xsl:variable name="chosen-msd" as="xs:string" select="$my_choice-lex/descendant::tei:symbol[@xml:id eq $my_choice-id]/@value"/>
  <xsl:variable name="my_index" select="fn:accumulator-before('element-index')" as="xs:integer"/>
  <xsl:element name="span" namespace="{$KorAP_namespace}">
    <xsl:attribute name="id" select="'m' || $my_index"/>
    <xsl:attribute name="from" select="$offsets[1]"/>
    <xsl:attribute name="to" select="$offsets[2]"/>
    <xsl:attribute name="l" select="f:compute_nesting(.)"/>
    <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
      <xsl:attribute name="type" select="'lex'"/>
      <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
        <xsl:attribute name="name" select="'lex'"/>
        <!-- the orthographic form, for the human reader of the output -->
        <xsl:comment select="(if(@nkjp:nps) then ' ' else '_') || $my_morph-seg//tei:fs/tei:f[@name eq 'orth']/tei:string"/>
        <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
          <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="name" select="'lemma'"/>
            <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'base']/tei:string"/>
          </xsl:element>
          <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="name" select="'pos'"/>
            <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'ctag']/tei:symbol/@value"/>
          </xsl:element>
          <!-- the msd feature is left out when the chosen symbol has an empty value -->
          <xsl:if test="string-length($chosen-msd)">
            <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
              <xsl:attribute name="name" select="'msd'"/>
              <xsl:value-of select="$chosen-msd"/>
            </xsl:element>
          </xsl:if>
          <!-- an 'nps' feature in the morphosyntactic segment is rendered as join=left -->
          <xsl:if test="$my_morph-seg//tei:fs/tei:f[@name eq 'nps']">
            <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
              <xsl:attribute name="name" select="'join'"/>
              <xsl:value-of select="'left'"/>
            </xsl:element>
          </xsl:if>
        </xsl:element>
      </xsl:element>
    </xsl:element>
  </xsl:element>
</xsl:template>
<!-- ************************** named entities ******************* -->
<!-- create_named: writes the named-entity layer of one NKJP text to $target.
     compoundID            written as @docid of the layer element
     ann_morphosyntax.xml  the document whose tei:text elements are processed in mode 'named'
     ann_named.xml         handed on to the 'named' templates as a tunnel parameter
     target                URI of the file to be created
     rev_lookup            map handed on to the 'named' templates as a tunnel parameter -->
<xsl:template name="create_named">
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="ann_named.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>
  <xsl:param name="rev_lookup" as="map(*)"/>
  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <xsl:processing-instruction name="xml-model"
      select="'href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;'"/>
    <xsl:element namespace="{$KorAP_namespace}" name="layer">
      <xsl:attribute name="docid" select="$compoundID"/>
      <xsl:attribute name="version" select="$KorAP-XML_version"/>
      <xsl:element namespace="{$KorAP_namespace}" name="spanList">
        <xsl:apply-templates mode="named" select="$ann_morphosyntax.xml//tei:text">
          <xsl:with-param name="ann_named.xml" tunnel="yes" as="document-node()"
            select="$ann_named.xml"/>
          <xsl:with-param name="rev_lookup" tunnel="yes" as="map(*)" select="$rev_lookup"/>
        </xsl:apply-templates>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
<!-- Mode 'named': segments produce nothing by default ... -->
<xsl:template mode="named" match="tei:seg"/>
<!-- ... except for those with a 'disamb' feature: if $rev_lookup holds pointers filed under the
     ID of such a segment, one span is emitted for it, with one 'complex-ent' feature structure
     for each of those pointers.
     ann_named.xml (tunnel)  the named-entity document
     rev_lookup (tunnel)     map from segment IDs to the tei:ptr elements filed under them -->
<xsl:template mode="named" match="tei:seg[tei:fs[tei:f[@name eq 'disamb']]]">
  <xsl:param name="ann_named.xml" as="document-node()" tunnel="yes"/>
  <xsl:param name="rev_lookup" as="map(*)" tunnel="yes"/>
  <!-- the (from, to) pair recorded for this segment by the morpho-offsets accumulator -->
  <xsl:variable name="span_bounds" as="xs:integer+"
    select="map:get(accumulator-after('morpho-offsets')[last()], string(@xml:id))"/>
  <!-- it's an element, because we need to see where it stands in a sequence... -->
  <xsl:variable name="pointers" as="element(tei:ptr)*" select="$rev_lookup(string(@xml:id))"/>
  <xsl:if test="exists($pointers)">
    <xsl:variable name="span_number" as="xs:integer" select="accumulator-before('element-index')"/>
    <xsl:element namespace="{$KorAP_namespace}" name="span">
      <xsl:attribute name="id" select="'n' || $span_number"/>
      <xsl:attribute name="from" select="$span_bounds[1]"/>
      <xsl:attribute name="to" select="$span_bounds[2]"/>
      <xsl:attribute name="l" select="f:compute_nesting(.)"/>
      <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="fs">
        <xsl:attribute name="type" select="'ne'"/>
        <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="f">
          <xsl:attribute name="name" select="'ne'"/>
          <!-- the orthographic form, for the human reader of the output -->
          <xsl:comment
            select="(if (tei:fs/tei:f[@name eq 'nps']) then ' ' else '_') || tei:fs/tei:f[@name eq 'orth']/tei:string"/>
          <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="fs">
            <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="f">
              <xsl:attribute name="name" select="'complex-ent'"/>
              <xsl:for-each select="$pointers">
                <!-- the first feature structure of the segment that owns the pointer -->
                <xsl:variable name="entity_fs" as="element()*" select="parent::tei:seg/tei:fs[1]"/>
                <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="fs">
                  <xsl:attribute name="type" select="'complex-ent'"/>
                  <!-- the type and subtype features come first, as direct copies ... -->
                  <xsl:copy-of copy-namespaces="no"
                    select="$entity_fs/tei:f[@name = ('type', 'subtype')]"/>
                  <!-- ... followed by the entire feature structure, wrapped -->
                  <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="f">
                    <xsl:attribute name="name" select="'nkjp-named'"/>
                    <xsl:copy-of copy-namespaces="no" select="$entity_fs"/>
                  </xsl:element>
                </xsl:element>
              </xsl:for-each>
            </xsl:element>
          </xsl:element>
        </xsl:element>
      </xsl:element>
    </xsl:element>
  </xsl:if>
</xsl:template>
<!-- ************************** syntactic chunks ******************* -->
<!-- create_groups: writes the syntactic-groups layer of one NKJP text to $target.
     As it stands, the layer is created with an empty spanList: the instruction that would
     fill it is commented out below, so none of the four source documents is read.
     compoundID  written as @docid of the layer element
     target      URI of the file to be created -->
<xsl:template name="create_groups">
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="ann_words.xml" as="document-node()"/>
  <xsl:param name="ann_groups.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>
  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <xsl:processing-instruction name="xml-model"
      select="'href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;'"/>
    <xsl:element namespace="{$KorAP_namespace}" name="layer">
      <xsl:attribute name="docid" select="$compoundID"/>
      <xsl:attribute name="version" select="$KorAP-XML_version"/>
      <xsl:element namespace="{$KorAP_namespace}" name="spanList">
        <!--<xsl:apply-templates select="$ann_segmentation.xml//tei:text" mode="groups">
          <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml" as="document-node()" tunnel="yes"/>
          <xsl:with-param name="ann_words.xml" select="$ann_words.xml" as="document-node()" tunnel="yes"/>
          <xsl:with-param name="ann_groups.xml" select="$ann_groups.xml" as="document-node()" tunnel="yes"/>
        </xsl:apply-templates>-->
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
<!-- ************************** TEXT header ******************* -->
<!-- create_text_header: writes the text-level header file to $target.
     The children of tei:teiHeader found under any tei:TEI element of $text.xml
     are processed in mode "text" and wrapped in an idsHeader element;
     $compoundID is handed down as a tunnel parameter. -->
<xsl:template name="create_text_header">
    <xsl:param name="text.xml" as="document-node()"/>
    <xsl:param name="compoundID" as="xs:string"/>
    <xsl:param name="target" as="xs:string"/>
    <!-- the header sections to be converted -->
    <xsl:variable name="header_parts" as="element()*"
        select="$text.xml//tei:TEI/tei:teiHeader/tei:*"/>
    <!-- create the local header.xml file -->
    <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
        xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
        <idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
            <xsl:apply-templates select="$header_parts" mode="text">
                <xsl:with-param name="compoundID" select="$compoundID" as="xs:string" tunnel="yes"/>
            </xsl:apply-templates>
        </idsHeader>
    </xsl:result-document>
</xsl:template>
<!-- text header: re-create fileDesc without a namespace and convert its children in mode "text" -->
<xsl:template match="tei:fileDesc" mode="text">
    <fileDesc>
        <xsl:apply-templates mode="text"/>
    </fileDesc>
</xsl:template>
<!-- text header: a TEI title becomes t.title.
     The content is processed in the "header-text" mode, in line with the
     corpus-level title template and the shared tei:p template. An apply-templates
     without a mode attribute would leave the header modes and continue in the
     unnamed mode instead. -->
<xsl:template match="tei:title" mode="text">
    <t.title>
        <xsl:apply-templates mode="header-text"/>
    </t.title>
</xsl:template>
<!-- text header: titleStmt opens with a textSigle holding the tunnelled
     compoundID, followed by the converted children (mode "text") -->
<xsl:template match="tei:titleStmt" mode="text">
    <xsl:param name="compoundID" as="xs:string" tunnel="yes"/>
    <xsl:element name="titleStmt">
        <xsl:element name="textSigle">{$compoundID}</xsl:element>
        <xsl:apply-templates mode="text"/>
    </xsl:element>
</xsl:template>
<!-- text header: re-create publicationStmt without a namespace; children continue in mode "text" -->
<xsl:template match="tei:publicationStmt" mode="text">
    <publicationStmt>
        <xsl:apply-templates mode="text"/>
    </publicationStmt>
</xsl:template>
<!-- text header: re-create availability; only its attributes and child elements
     are processed (mode "text"), so direct text-node children are not visited -->
<xsl:template match="tei:availability" mode="text">
    <availability>
        <xsl:apply-templates select="@*, *" mode="text"/>
    </availability>
</xsl:template>
<!-- text header: re-create profileDesc without a namespace; children continue in mode "text" -->
<xsl:template match="tei:profileDesc" mode="text">
    <profileDesc>
        <xsl:apply-templates mode="text"/>
    </profileDesc>
</xsl:template>
<!-- text header: re-create textClass; only attributes and child elements are processed (mode "text") -->
<xsl:template match="tei:textClass" mode="text">
    <textClass>
        <xsl:apply-templates select="@*, *" mode="text"/>
    </textClass>
</xsl:template>
<!-- text and corpus headers: re-create catRef with its attributes and child elements.
     NOTE(review): the nested apply-templates always uses mode "text", also when this
     template was reached in mode "corpus"; confirm that this is intended. -->
<xsl:template match="tei:catRef" mode="text corpus">
    <catRef>
        <xsl:apply-templates select="@*, *" mode="text"/>
    </catRef>
</xsl:template>
<!-- text and corpus headers: the attributes listed in the pattern are copied
     unchanged; xml:id is matched only when it sits somewhere below tei:classDecl -->
<xsl:template mode="text corpus"
    match="@status | @scheme | @target | @type | @xml:id[ancestor::tei:classDecl] | @xml:lang">
    <xsl:copy/>
</xsl:template>
<!-- text and corpus headers: re-create p without a namespace; its content is
     handed over to the "header-text" mode (defined outside this excerpt) -->
<xsl:template match="tei:p" mode="text corpus">
    <p>
        <xsl:apply-templates mode="header-text"/>
    </p>
</xsl:template>
<!-- OPTIMIZATION has to take modes into account -->
<!-- ************************** CORPUS header ******************* -->
<!-- create_corpus_header: writes the corpus-level header file to $target.
     The children of tei:teiHeader directly under the tei:teiCorpus root of
     $text.xml are processed in mode "corpus" and wrapped in an idsHeader element. -->
<xsl:template name="create_corpus_header">
    <xsl:param name="text.xml" as="document-node()"/>
    <xsl:param name="target" as="xs:string"/>
    <!-- the header sections to be converted -->
    <xsl:variable name="header_parts" as="element()*"
        select="$text.xml/tei:teiCorpus/tei:teiHeader/tei:*"/>
    <!-- create the corpus-level header.xml file -->
    <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes">
        <!-- not set on purpose:
            doctype-public="{$publicDoctypeI5}"
            doctype-system="{$systemDoctypeI5}"
            these are, sadly, useless
        -->
        <idsHeader type="corpus" pattern="text" status="new" version="1.1" TEIform="teiHeader">
            <xsl:apply-templates select="$header_parts" mode="corpus"/>
        </idsHeader>
    </xsl:result-document>
</xsl:template>
<!-- corpus header: re-create fileDesc without a namespace; children continue in mode "corpus" -->
<xsl:template match="tei:fileDesc" mode="corpus">
    <fileDesc>
        <xsl:apply-templates mode="corpus"/>
    </fileDesc>
</xsl:template>
<!-- corpus header: a TEI title becomes c.title; attributes are processed in
     mode "corpus", the content in mode "header-text" -->
<xsl:template match="tei:title" mode="corpus">
    <xsl:element name="c.title">
        <xsl:apply-templates select="@*" mode="corpus"/>
        <xsl:apply-templates mode="header-text"/>
    </xsl:element>
</xsl:template>
<!-- corpus header: titleStmt opens with a korpusSigle holding $corpusID,
     followed by the converted children (mode "corpus") -->
<xsl:template match="tei:titleStmt" mode="corpus">
    <xsl:element name="titleStmt">
        <xsl:element name="korpusSigle">{$corpusID}</xsl:element>
        <xsl:apply-templates mode="corpus"/>
    </xsl:element>
</xsl:template>
<!-- corpus header: re-create publicationStmt without a namespace; children continue in mode "corpus" -->
<xsl:template match="tei:publicationStmt" mode="corpus">
    <publicationStmt>
        <xsl:apply-templates mode="corpus"/>
    </publicationStmt>
</xsl:template>
<!-- corpus header: re-create availability; only its attributes and child
     elements are processed (mode "corpus") -->
<xsl:template match="tei:availability" mode="corpus">
    <availability>
        <xsl:apply-templates select="@*, *" mode="corpus"/>
    </availability>
</xsl:template>
<!-- corpus header: re-create encodingDesc without a namespace; children continue in mode "corpus" -->
<xsl:template match="tei:encodingDesc" mode="corpus">
    <encodingDesc>
        <xsl:apply-templates mode="corpus"/>
    </encodingDesc>
</xsl:template>
<!-- corpus header: structural elements of the class declaration are re-created
     under their local name (no namespace); only attributes and child elements
     are processed further, in mode "corpus" -->
<xsl:template mode="corpus"
    match="tei:classDecl | tei:taxonomy | tei:category | tei:taxonomy/tei:bibl">
    <xsl:element name="{local-name(.)}">
        <xsl:apply-templates select="@*, *" mode="corpus"/>
    </xsl:element>
</xsl:template>
<!-- corpus header: text-bearing elements (a title inside bibl, edition, desc) are
     re-created under their local name; attributes are processed in mode "corpus",
     the content in mode "header-text" -->
<xsl:template mode="corpus"
    match="tei:bibl/tei:title | tei:edition | tei:desc">
    <xsl:element name="{local-name(.)}">
        <xsl:apply-templates select="@*" mode="corpus"/>
        <xsl:apply-templates mode="header-text"/>
    </xsl:element>
</xsl:template>
<!--
<xsl:template match="tei:textClass" mode="corpus">
<xsl:element name="{local-name()}">
<xsl:apply-templates mode="corpus" select="@* | *"/>
</xsl:element>
</xsl:template>
<xsl:template match="tei:catRef" mode="corpus">
<xsl:element name="{local-name()}">
<xsl:apply-templates mode="corpus" select="@* | *"/>
</xsl:element>
</xsl:template>
-->
<!-- this template can be called by the XSpec test; TODO: find a way to call the main() template directly -->
<!-- I have not fully handled the param transmission (which would have to be kludged in just for the sake of XSpec),
because I'm disabling this for now due to XSpec design issues; relevant links, a.o.:
https://stackoverflow.com/questions/64933277/what-is-the-cause-of-error-cannot-execute-xslresult-document-while-evaluating
https://www.balisage.net/Proceedings/vol25/html/Galtman01/BalisageVol25-Galtman01.html
In short: the internal design of XSpec forces kludges when one wants to use xsl:result-document in their stylesheets. But I don't
want to be strangled by kludges at the beginning of work, and I've already lost quite a bit of time on this investigation.
I will therefore "just code" and can then think of externalizing bits of templates if we want to play with tests. For now,
I don't want to have to handle context items in a special way inside variables, etc., because I'm not sure it's worth it.
-->
<!--<xsl:template name="test_full">
<xsl:param name="corpusID"/>
<xsl:param name="docID"/>
<xsl:param name="textID"/>
<xsl:call-template name="xsl:initial-template"/>
</xsl:template>-->
</xsl:stylesheet>
<!-- template for serializing maps in messages <xsl:message select="('map:',serialize($map, map{'method':'adaptive'}))"/> -->