blob: 62cb4367622211e2d3b5cd07af0c9d3c4a33b86b [file] [log] [blame]
Akron9a8ee3e2022-01-31 13:51:49 +01001<?xml version="1.0" encoding="UTF-8"?>
2<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
bansp5e2d1c02022-03-10 04:51:40 +01003 xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"
4 xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:f="func"
Piotr Banskif8af3a92022-05-23 03:20:10 +02005 xmlns:fn="http://www.w3.org/2005/xpath-functions"
Piotr Banski6a4a2522022-05-24 01:16:47 +02006 xmlns:map="http://www.w3.org/2005/xpath-functions/map" exclude-result-prefixes="xs f fn map nkjp tei"
bansp5e2d1c02022-03-10 04:51:40 +01007 version="3.0" expand-text="yes">
Akron9a8ee3e2022-01-31 13:51:49 +01008
banspe726b4a2022-03-28 05:47:45 +02009
10<!-- PARAMETERS -->
bansp5e2d1c02022-03-10 04:51:40 +010011
<!-- The directory containing the NKJP files, in the form of a collection of text-level
     directories (that layout is how we know both the $corpusID and the $docID). -->
<xsl:param name="sourceDir" as="xs:string" select="'test/resources/nkjp2korap_sample2'"/>

<!-- Where the corpus/document/text/annotations hierarchy is going to be created. -->
<xsl:param name="targetDir" as="xs:string" select="'test/output'"/>

<!-- Comma-separated list of IDs to be skipped from processing, for example:
       HellerPodgladanie,KOT
     The list is split on ',' and compared by plain string identity only; nothing beyond
     that is supported (this is just for testing). -->
<xsl:param name="skip_docID" as="xs:string"
  select="'HellerPodgladanie,IsakowiczZaleskiMoje,KolakowskiOco,MysliwskiKamien,WilkWilczy,ZycieWarszawy_Zycie'"/>
banspb5992532022-03-29 15:55:44 +020026
bansp8f6700b2022-03-27 05:27:09 +020027
bansp9dc10002022-05-17 22:33:34 +020028<!-- VARIABLES (= constants...) -->
banspe726b4a2022-03-28 05:47:45 +020029
<!-- Corpus and document identifiers used to build target paths and @docid values. -->
<xsl:variable name="corpusID" static="yes" as="xs:string" select="'NKJP'"/>
<xsl:variable name="docID" static="yes" as="xs:string" select="'NKJP'"/>

<!-- Target corpus directory, with a trailing slash so that callers can append to it directly. -->
<xsl:variable name="targetCorpusDir_slashed" as="xs:string"
  select="concat($targetDir, '/', $corpusID, '/')"/>

<!-- System and public identifiers of the I5 DTD. -->
<xsl:variable name="systemDoctypeI5" static="yes" as="xs:string"
  select="'http://corpora.ids-mannheim.de/I5/DTD/i5.dtd'"/>

<xsl:variable name="publicDoctypeI5" static="yes" as="xs:string"
  select="'-//IDS//DTD I5 1.0//EN'"/>

<!-- Namespace of the KorAP-XML elements created by this stylesheet. -->
<xsl:variable name="KorAP_namespace" static="yes" as="xs:string"
  select="'http://ids-mannheim.de/ns/KorAP'"/>

<!-- Value written to layer/@version (the author notes: "this is only a bit funny"). -->
<xsl:variable name="KorAP-XML_version" static="yes" as="xs:string" select="'KorAP-0.4'"/>

<!-- Query parameters appended to the collection URI; see
     https://www.saxonica.com/documentation11/index.html#!sourcedocs/collections/collection-directories -->
<xsl:variable name="collection_params" static="yes" as="xs:string"
  select="'recurse=yes;validation=strip;select=text.xml;content-type=application/xml;on-error=warning;xinclude=yes'"/>

<!-- All text.xml documents found under $sourceDir; at least one document is required. -->
<xsl:variable name="collection_of_text" as="document-node()+"
  select="collection(concat($sourceDir, '?', $collection_params))"/>

<!-- These two 'flags' are meant to increase the readability of the code: they index the
     output of the calc_offsets() function, whose returned value is a sequence, (start, end).

     Remove them together with that function! -->
<xsl:variable name="OFFSET_START" static="yes" as="xs:integer" select="1"/>
<xsl:variable name="OFFSET_END" static="yes" as="xs:integer" select="2"/>
banspb5992532022-03-29 15:55:44 +020063
64
banspe726b4a2022-03-28 05:47:45 +020065<!-- MODES -->
bansp5e2d1c02022-03-10 04:51:40 +010066
<!-- Named modes: 'corpus' and 'text' drop anything without an explicit rule, while
     'header-text' keeps only the text of unmatched nodes. -->
<xsl:mode on-no-match="deep-skip" name="corpus"/>
<xsl:mode on-no-match="deep-skip" name="text"/>
<xsl:mode on-no-match="text-only-copy" name="header-text"/>
<!-- The unnamed mode is declared with access to every accumulator. -->
<xsl:mode use-accumulators="#all"/>
71
<!-- Accumulates, in document order, one single-entry map per accepted word:
       seg/@xml:id -> (start offset, end offset)
     The sequence starts with an empty map, so a count of 1 means "no word seen yet".
     Only tei:w children of tei:seg elements without @nkjp:rejected contribute. -->
<xsl:accumulator name="elem-offset-seq" as="map(xs:string, item()+)+" initial-value="(map{})">
  <xsl:accumulator-rule match="tei:w[parent::tei:seg[count(@nkjp:rejected) eq 0]]" phase="end">
    <xsl:variable name="seg" select="parent::tei:seg"/>

    <!-- End offset of the most recently recorded word, or 0 for the very first word. -->
    <xsl:variable name="previous_index" as="xs:integer">
      <xsl:choose>
        <xsl:when test="count($value) ne 1">
          <xsl:variable name="latest_entry" as="map(*)" select="$value[last()]"/>
          <xsl:variable name="latest_key" select="map:keys($latest_entry)[1]"/>
          <xsl:sequence select="map:get($latest_entry, $latest_key)[2]"/>
          <xsl:message select="'previous element:' || $latest_key"/>
        </xsl:when>
        <xsl:otherwise>
          <xsl:sequence select="0"/>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:variable>

    <!-- A word preceded by whitespace starts one position after the previous end. -->
    <xsl:variable name="our_base" as="xs:integer"
      select="$previous_index + xs:integer(f:is_preceded_by_ws($seg))"/>

    <xsl:message select="'previous_index:' || $previous_index || 'our_base: ' || $our_base"/>

    <!-- Append the entry for this word; its length is the string length of the tei:w itself. -->
    <xsl:sequence select="
        $value,
        map {
          string($seg/@xml:id): ($our_base, $our_base + string-length())
        }"/>
  </xsl:accumulator-rule>
</xsl:accumulator>
bansp5e2d1c02022-03-10 04:51:40 +010097
banspe726b4a2022-03-28 05:47:45 +020098
99 <!-- FUNCTIONS -->
100
<!-- Nesting depth of $node: the number of elements on its ancestor-or-self axis,
     not counting any element whose local name is 'TEI' or 'teiCorpus'. -->
<xsl:function name="f:compute_nesting" as="xs:integer">
  <xsl:param name="node" as="element()"/>
  <xsl:sequence
    select="count($node/ancestor-or-self::*[not(local-name(.) = ('TEI', 'teiCorpus'))])"/>
</xsl:function>
108
<!-- Tells whether whitespace is to be assumed in front of $node.

     $node must be an element whose local name is 'seg', 's' or 'p'; any other element
     terminates the transformation with an error message.

       seg : true unless the seg carries @nkjp:nps, and unless it is the first seg of the
             first tei:s of the first tei:p (judged by preceding siblings at each level).
       s   : true if there is a preceding sibling tei:s, or if the nearest enclosing tei:p
             has a preceding sibling tei:p.
       p   : true if there is a preceding sibling tei:p.

     The diagnostic messages in the 's' and 'p' branches go through string-join(): the '||'
     operator accepts at most one item per operand, so concatenating the raw node sequence
     failed with a type error as soon as there were two or more preceding siblings. -->
<xsl:function name="f:is_preceded_by_ws" as="xs:boolean">
  <xsl:param name="node" as="element()"/>
  <xsl:choose>
    <xsl:when test="local-name($node) eq 'seg'">
      <xsl:sequence
        select="not(exists($node/@nkjp:nps)) and not($node[count(preceding-sibling::tei:seg) eq 0]/ancestor::tei:s[count(preceding-sibling::tei:s) eq 0]/ancestor::tei:p[count(preceding-sibling::tei:p) eq 0])"/>
      <!-- Earlier variant of the second condition, kept for reference:
           and not($node/ancestor::tei:s[count(preceding-sibling::tei:s) eq 0]/ancestor::tei:p[count(preceding-sibling::tei:p) eq 0]) -->
    </xsl:when>
    <xsl:when test="local-name($node) eq 's'">
      <xsl:variable name="earlier_s" select="$node/preceding-sibling::tei:s"/>
      <xsl:variable name="earlier_p" select="$node/ancestor::tei:p[1]/preceding-sibling::tei:p"/>

      <xsl:message select="'s - prec s: ' || string-join($earlier_s, ' ')"/>
      <xsl:message select="'same s - prec p: ' || string-join($earlier_p, ' ') || '&#10;'"/>

      <xsl:sequence select="exists($earlier_s) or exists($earlier_p)"/>
    </xsl:when>
    <xsl:when test="local-name($node) eq 'p'">
      <xsl:variable name="earlier_p" select="$node/preceding-sibling::tei:p"/>

      <xsl:message select="'p : ' || string-join($earlier_p, ' ')"/>
      <xsl:sequence select="exists($earlier_p)"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:message terminate="yes"
        select="'Wrong argument passed to f:is_preceded_by_ws(): ' || local-name($node) || ' Only p, s, seg are allowed.'"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>
banspd1bf1db2022-04-04 02:16:24 +0200134
banspb5992532022-03-29 15:55:44 +0200135
136<!-- UTILITY TEMPLATES -->
137
<!-- Nodes that produce no output in any mode:
       @default : deletes an auto-inserted attribute throughout;
       tei:w    : word elements are not processed by template rules. -->
<xsl:template match="@default | tei:w" mode="#all"/>
<!-- NKJP-SGJP has apparently given up on standoff representations by adding <w> everywhere;
     for the time being, we'll just stick to the standoff offsets, although that may need to
     be revisited as the NKJP format has now begun to stray from its schemas and assumptions -->
bansp8f6700b2022-03-27 05:27:09 +0200145
Piotr Banski6a4a2522022-05-24 01:16:47 +0200146 <!--<xsl:template match="tei:choice" mode="#all"/>-->
<!-- THIS IS ONLY TEMPORARY,
     because an interesting challenge came up where I will
     probably have to abandon straightforward mapping because of TOKENIZATION alternatives;

     but for now, I just want this stylesheet to work, even if it eats an occasional token
     (which it now does: 'komuś' and 'czym' vanish)
-->
Piotr Banski6a4a2522022-05-24 01:16:47 +0200153
<!-- Fall-through for tei:choice in mode 'struct': the element itself yields no span, and
     processing continues with every tei:seg found anywhere below it.

     The nested segs are processed in the current mode. Without an explicit mode,
     xsl:apply-templates switches to the unnamed mode, so the 'struct' rule for tei:*
     never saw these segs. The ini/fin/index parameters of that rule are declared here
     with the same defaults and handed on unchanged, so that the choice wrapper stays
     transparent to its callers.

     NOTE(review): all descendant segs are selected, including any that carry
     @nkjp:rejected - confirm whether rejected alternatives should be filtered out here. -->
<xsl:template match="tei:choice" mode="struct">
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>

  <xsl:apply-templates select="descendant::tei:seg" mode="#current">
    <xsl:with-param name="ini" select="$ini" as="xs:integer"/>
    <xsl:with-param name="fin" select="$fin" as="xs:integer"/>
    <xsl:with-param name="index" select="$index" as="xs:integer"/>
  </xsl:apply-templates>
</xsl:template>
banspb5992532022-03-29 15:55:44 +0200158
159 <!-- MAIN PROCESSING -->
160
161
<!-- Entry point: writes the corpus header once, then converts every text.xml of the
     source collection, except those whose text ID is listed in $skip_docID. -->
<xsl:template name="xsl:initial-template">
  <xsl:variable name="IDs_to_skip" as="xs:string*" select="tokenize($skip_docID, ',')"/>

  <!-- We only want to call the template below once, and we process an arbitrary NKJP corpus
       file for that purpose, because all we need is the main corpus header, and we can
       (should) get to that from any NKJP corpus document. -->
  <xsl:call-template name="create_corpus_header">
    <xsl:with-param name="text.xml" select="$collection_of_text[1]" as="document-node()"/>
    <xsl:with-param name="target" select="$targetCorpusDir_slashed || 'header.xml'" as="xs:string"/>
  </xsl:call-template>

  <xsl:for-each select="$collection_of_text">
    <!-- Directory of the current text: the document URI minus its final '/text.xml' step.
         The pattern is anchored at the end so that only the file-name step is removed. -->
    <xsl:variable name="my_dir" as="xs:string" select="replace(base-uri(), '/text\.xml$', '')"/>
    <!-- The text ID is the name of that directory. -->
    <xsl:variable name="my_textID" as="xs:string" select="tokenize($my_dir, '/')[last()]"/>
    <xsl:variable name="ann_morphosyntax.uri" as="xs:string"
      select="$my_dir || '/ann_morphosyntax.xml'"/>
    <xsl:variable name="ann_segmentation.uri" as="xs:string"
      select="$my_dir || '/ann_segmentation.xml'"/>

    <!-- Skipping is a utility step, for when we want to ignore some texts for any reason
         (debugging, selective update). -->
    <xsl:if test="not($my_textID = $IDs_to_skip)">
      <xsl:call-template name="process_single_sample">
        <xsl:with-param name="text.xml" as="document-node()" select="."/>
        <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
          select="doc($ann_morphosyntax.uri)"/>
        <xsl:with-param name="ann_segmentation.xml" as="document-node()"
          select="doc($ann_segmentation.uri)"/>
        <xsl:with-param name="my_textID" select="$my_textID" as="xs:string"/>
      </xsl:call-template>
    </xsl:if>
  </xsl:for-each>
</xsl:template>
201
<!-- Converts one NKJP text: derives the target directory and the compound ID, then calls
     the templates that write data.xml and struct/structure.xml.

     Parameters:
       text.xml              the text.xml document of the sample (only used by calls that
                             are currently disabled below)
       ann_morphosyntax.xml  its ann_morphosyntax.xml document (likewise only used by a
                             disabled call)
       ann_segmentation.xml  its ann_segmentation.xml document
       my_textID             the text ID; an empty textID should never happen, but if the
                             parameter is not supplied, the default value signals that at
                             the top of the output -->
<xsl:template name="process_single_sample">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="my_textID" as="xs:string" select="'0-BAD_textID'"/>

  <xsl:variable name="textTargetDir" as="xs:string"
    select="concat($targetCorpusDir_slashed, $docID, '/', $my_textID)"/>

  <!-- This is what occurs in the text and data layers as @docid. -->
  <xsl:variable name="compoundID" as="xs:string"
    select="concat($corpusID, '_', $docID, '.', $my_textID)"/>

  <!-- This is space devoted to recalculating word offsets on the basis of
       ann_segmentation.xml (rather than text.xml). The results should be available to all
       annotation files, so the plan was to prepare a map here and send it off to whichever
       annotation layer needs it.

       This was done in several steps, because the author wanted to be able to look stuff up;
       there should probably be some idiomatic way to reduce the memory footprint of such
       variables.

       Three map variables used to be built at this point and are currently disabled:
         map_w      one entry per accepted seg, keyed by @xml:id, holding the ids of the
                    enclosing p and s, the position within the s, the is_preceded_by_ws
                    flag and the normalized word form;
         map_p-s-w  entries for p, s and w elements with their positions and computed
                    lengths (sum of the word lengths, plus one per word for whitespace,
                    minus one for each word not preceded by whitespace);
         offsets-p  the position of every p entry of map_p-s-w.
       NOTE(review): the struct template now reads the 'elem-offset-seq' accumulator for
       offsets, which presumably replaces these maps; retrieve the disabled code from the
       revision history if it is needed again. -->

  <xsl:call-template name="create_data">
    <!-- a text.xml parameter used to be passed here as well; it is currently disabled -->
    <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml" as="document-node()"/>
    <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
    <xsl:with-param name="target" select="concat($textTargetDir, '/data.xml')" as="xs:string"/>
  </xsl:call-template>

  <xsl:call-template name="create_struct">
    <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
    <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml" as="document-node()"/>
    <xsl:with-param name="target" select="concat($textTargetDir, '/struct/structure.xml')" as="xs:string"/>
    <!-- an 'offsets' map parameter used to be passed here; it is currently disabled -->
  </xsl:call-template>

  <!-- Currently disabled:
  <xsl:call-template name="create_morpho">
    <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
    <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
    <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml" as="document-node()"/>
    <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml" as="document-node()"/>
    <xsl:with-param name="target" select="concat($textTargetDir, '/nkjp/morpho.xml')" as="xs:string"/>
  </xsl:call-template>
  -->

  <!-- Currently disabled:
  <xsl:call-template name="create_text_header">
    <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
    <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
    <xsl:with-param name="target" select="concat($textTargetDir, '/header.xml')" as="xs:string"/>
  </xsl:call-template>
  -->
</xsl:template>
361
362 <!-- ************************** data.xml ******************* -->
363
<!-- Creates the data.xml file: a KorAP raw_text element holding a metadata reference and
     the plain text of the sample.

     Parameters:
       ann_segmentation.xml  the segmentation document the text is rebuilt from
       compoundID            value of raw_text/@docid
       target                URI of the file to be written

     The text is rebuilt from the tei:w children of all tei:seg elements that do not carry
     @nkjp:rejected, with a single space in front of every seg for which
     f:is_preceded_by_ws() is true. -->
<xsl:template name="create_data">
  <!-- a text.xml parameter used to be declared here; it is currently disabled -->
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="target" as="xs:string"/>

  <!-- xpath-default-namespace is a standard attribute, not an attribute value template, so
       the namespace URI is spelled out here exactly as in the other create_* templates. -->
  <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">

    <xsl:processing-instruction name="xml-model">href=&quot;text.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
    <xsl:element name="raw_text" namespace="{$KorAP_namespace}">
      <xsl:attribute name="docid" select="$compoundID"/>
      <xsl:element name="metadata" namespace="{$KorAP_namespace}">
        <xsl:attribute name="file" select="'metadata.xml'"/>
      </xsl:element>

      <xsl:element name="text" namespace="{$KorAP_namespace}">
        <!-- One separator string ('' or ' ') followed by the word form(s), per accepted seg. -->
        <xsl:variable name="content" as="xs:string+">
          <xsl:for-each
            select="$ann_segmentation.xml/tei:teiCorpus/tei:TEI/tei:text/tei:body/tei:p/tei:s//tei:seg[count(@nkjp:rejected) eq 0]">
            <xsl:sequence select="
                if (f:is_preceded_by_ws(.)) then
                  ' '
                else
                  '', ./tei:w"/>
          </xsl:for-each>
        </xsl:variable>
        <xsl:value-of select="string-join($content)"/>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
395
bansp5f841732022-03-16 06:27:31 +0100396 <!-- ************************** struct ******************* -->
397
<!-- Creates the structure layer file: a KorAP layer element whose spanList is filled by
     applying the 'struct' mode to every tei:text of the segmentation document.

     Parameters:
       compoundID            value of layer/@docid
       ann_segmentation.xml  the segmentation document to be traversed
       target                URI of the file to be written

     An 'offsets' map parameter (declared here and passed on as a tunnel parameter built
     from the 'elem-offset-seq' accumulator) is currently disabled. -->
<xsl:template name="create_struct">
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <xsl:processing-instruction name="xml-model">href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
    <xsl:element name="layer" namespace="{$KorAP_namespace}">
      <xsl:attribute name="docid">{$compoundID}</xsl:attribute>
      <xsl:attribute name="version">{$KorAP-XML_version}</xsl:attribute>

      <xsl:element name="spanList" namespace="{$KorAP_namespace}">
        <xsl:apply-templates mode="struct" select="$ann_segmentation.xml//tei:text"/>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
427
<!-- Emits one KorAP span for the current TEI element and then processes its children in
     the same mode. Only text, body, p, s and seg are handled; any other TEI element that
     reaches this rule terminates the transformation.

     Parameters (all optional):
       ini    start offset of the parent span (default 0)
       fin    end offset of the parent span (default 999999999)
       index  index of the parent span, from which the id of this span is derived (default 1)

     The computation is deliberately spread out over many variables, so that the individual
     constituent values can be looked up should anything go wrong; optimization is meant to
     come once the stylesheet has been run against a larger dataset. -->
<xsl:template match="tei:*" mode="struct">
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>

  <xsl:variable name="tei_ns" as="xs:string" select="'http://www.tei-c.org/ns/1.0'"/>

  <!-- All entries accumulated up to the end of the current element, merged into one map
       (the leading empty map of the accumulator value is dropped first). -->
  <xsl:variable name="offsets" as="map(xs:string, xs:integer+)"
    select="map:merge(tail(fn:accumulator-after('elem-offset-seq')))"/>

  <!-- Preceding siblings that share the local name of the current element. -->
  <xsl:variable name="own_name" as="xs:string" select="local-name()"/>
  <xsl:variable name="earlier_siblings" select="preceding-sibling::*[local-name(.) eq $own_name]"/>
  <xsl:variable name="earlier_count" select="count($earlier_siblings)"/>

  <!-- For s and p only: the number of elements nested inside those preceding siblings
       (mind @nkjp:rejected); 0 for everything else, and 0 when there are no such siblings. -->
  <xsl:variable name="nested_in_earlier" as="xs:integer" select="
      if (self::tei:s or self::tei:p) then
        sum($earlier_siblings ! count(descendant::*))
      else
        0"/>

  <xsl:variable name="my_index" as="xs:integer"
    select="$index + 1 + $earlier_count + $nested_in_earlier"/>

  <!-- Start offset: 0 for text and body; otherwise the integer found in the relevant
       @corresp between its first comma and the comma that follows, looking only at the
       part in front of the first ')'. For p and s the @corresp of the first seg is used.
       NOTE(review): presumably @corresp has the shape ...(id,start,length) - confirm
       against the NKJP schema. -->
  <xsl:variable name="start" as="xs:integer">
    <xsl:choose>
      <xsl:when test="self::tei:text or self::tei:body">
        <xsl:sequence select="0"/>
      </xsl:when>
      <xsl:when test="self::tei:p">
        <xsl:variable name="first_corresp" as="attribute(corresp)"
          select="descendant::tei:s[1]/descendant::tei:seg[1]/attribute::corresp"/>
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before($first_corresp, ')'), ','), ','))"/>
      </xsl:when>
      <xsl:when test="self::tei:s">
        <xsl:variable name="first_corresp" as="attribute(corresp)"
          select="descendant::tei:seg[1]/attribute::corresp"/>
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before($first_corresp, ')'), ','), ','))"/>
      </xsl:when>
      <xsl:when test="self::tei:seg">
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before(@corresp, ')'), ','), ','))"/>
      </xsl:when>
      <xsl:otherwise>
        <xsl:message terminate="yes" select="'Element not handled: ' || fn:local-name()"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:variable>

  <!-- End offset, as returned by f:calc_content_length() for the current element. -->
  <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

  <xsl:element name="span" namespace="{$KorAP_namespace}">
    <xsl:attribute name="id">s{$my_index}</xsl:attribute>
    <xsl:attribute name="from">{$start}</xsl:attribute>
    <xsl:attribute name="to">{$end}</xsl:attribute>
    <!-- test output: the accumulator entry recorded under the @xml:id of this element -->
    <xsl:attribute name="accumulator" select="string-join(map:get($offsets, string(@xml:id)), ',')"/>
    <xsl:attribute name="l">{f:compute_nesting(.)}</xsl:attribute>
    <xsl:element name="fs" namespace="{$tei_ns}">
      <!-- STRUCT vs. LEX -->
      <xsl:attribute name="type">struct</xsl:attribute>
      <xsl:element name="f" namespace="{$tei_ns}">
        <xsl:attribute name="name">name</xsl:attribute>
        <xsl:value-of select="local-name()"/>
      </xsl:element>
      <!-- test output: the word form of a seg -->
      <xsl:if test="local-name() eq 'seg'">
        <xsl:element name="f" namespace="{$tei_ns}">
          <xsl:attribute name="name">orth</xsl:attribute>
          <xsl:value-of select="fn:normalize-space(.)"/>
        </xsl:element>
      </xsl:if>
      <!-- The attributes of the source element, one f per attribute. -->
      <xsl:if test="exists(@*)">
        <xsl:element name="f" namespace="{$tei_ns}">
          <xsl:attribute name="name">attr</xsl:attribute>
          <xsl:element name="fs" namespace="{$tei_ns}">
            <xsl:attribute name="type">attr</xsl:attribute>
            <xsl:for-each select="@*">
              <xsl:element name="f" namespace="{$tei_ns}">
                <xsl:attribute name="name" select="local-name(.)"/>
                <xsl:value-of select="."/>
              </xsl:element>
            </xsl:for-each>
          </xsl:element>
        </xsl:element>
      </xsl:if>
    </xsl:element>
  </xsl:element>

  <!-- Descend, handing the values of this span down to the children. -->
  <xsl:apply-templates mode="struct">
    <xsl:with-param name="ini" select="$start" as="xs:integer"/>
    <xsl:with-param name="fin" select="$end" as="xs:integer"/>
    <xsl:with-param name="index" select="$my_index"/>
  </xsl:apply-templates>
</xsl:template>
546
547 <!-- ************************** morpho ******************* -->
548
<!-- create_morpho: writes the morphosyntax layer document to $target.
     The result is a KorAP "layer" element (docid = $compoundID, version = $KorAP-XML_version)
     wrapping a "spanList"; the spans themselves are produced by applying the mode="morpho"
     templates to every tei:text found in the segmentation document, with the morphosyntax
     document handed down as a parameter.
     $text.xml is accepted but not read by this template. -->
<xsl:template name="create_morpho">
    <xsl:param name="text.xml" as="document-node()"/>
    <xsl:param name="compoundID" as="xs:string"/>
    <xsl:param name="ann_segmentation.xml" as="document-node()"/>
    <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
    <xsl:param name="target" as="xs:string"/>

    <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
        xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
        <!-- associates the output with the span.rng RELAX NG schema -->
        <xsl:processing-instruction name="xml-model">href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"</xsl:processing-instruction>
        <xsl:element name="layer" namespace="{$KorAP_namespace}">
            <xsl:attribute name="docid" select="$compoundID"/>
            <xsl:attribute name="version" select="$KorAP-XML_version"/>
            <xsl:element name="spanList" namespace="{$KorAP_namespace}">
                <xsl:apply-templates mode="morpho"
                    select="$ann_segmentation.xml/descendant::tei:text">
                    <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
                        select="$ann_morphosyntax.xml"/>
                </xsl:apply-templates>
            </xsl:element>
        </xsl:element>
    </xsl:result-document>
</xsl:template>
571
<!-- Generic traversal step of the morpho layer (tei:seg has its own, more specific template).
     Produces no output itself; it derives a running index plus start/end offsets for the
     current element and passes them, together with the morphosyntax document, to its children.
     Parameters:
       ini, fin              offsets received from the parent (not read here)
       index                 running index received from the parent
       ann_morphosyntax.xml  the morphosyntax annotation document
     NOTE(review): $start has no fallback branch, so it is only defined for
     text, body, p and s elements. -->
<xsl:template match="tei:*" mode="morpho">
    <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
    <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
    <xsl:param name="index" as="xs:integer" required="no" select="1"/>
    <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>

    <xsl:variable name="own_name" as="xs:string" select="local-name()"/>
    <!-- earlier siblings that carry the same local name as the current element -->
    <xsl:variable name="earlier_siblings"
        select="preceding-sibling::*[local-name(.) eq $own_name]"/>
    <!-- for s and p only: number of elements nested inside those earlier siblings
         (their descendant sets are disjoint, so a single count equals the per-sibling sum) -->
    <xsl:variable name="nested_in_earlier" as="xs:integer"
        select="if (self::tei:s or self::tei:p)
                then count($earlier_siblings/descendant::*)
                else 0"/>
    <xsl:variable name="my_index" as="xs:integer"
        select="$index + 1 + count($earlier_siblings) + $nested_in_earlier"/>

    <xsl:variable name="start" as="xs:integer">
        <xsl:choose>
            <xsl:when test="self::tei:text or self::tei:body">
                <xsl:sequence select="0"/>
            </xsl:when>
            <xsl:when test="self::tei:p or self::tei:s">
                <!-- @corresp of the first seg: for p, the first seg of its first s -->
                <xsl:variable name="first_corresp" as="attribute(corresp)"
                    select="if (self::tei:p)
                            then descendant::tei:s[1]/descendant::tei:seg[1]/attribute::corresp
                            else descendant::tei:seg[1]/attribute::corresp"/>
                <!-- take what lies between the first comma and the first ')' and keep the
                     integer in front of the next comma -->
                <xsl:sequence
                    select="xs:integer(substring-before(substring-after(substring-before($first_corresp, ')'), ','), ','))"/>
            </xsl:when>
        </xsl:choose>
    </xsl:variable>
    <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

    <xsl:apply-templates mode="morpho">
        <xsl:with-param name="ini" as="xs:integer" select="$start"/>
        <xsl:with-param name="fin" as="xs:integer" select="$end"/>
        <xsl:with-param name="index" select="$my_index"/>
        <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
            select="$ann_morphosyntax.xml"/>
    </xsl:apply-templates>
</xsl:template>
bansp5f841732022-03-16 06:27:31 +0100638
<!-- Emits one KorAP span for a segmentation-level tei:seg, carrying the disambiguated
     morphosyntactic reading looked up in the morphosyntax document.
     Parameters:
       ini, fin              offsets received from the parent (not read here)
       index                 running index received from the parent; the span id is derived from it
       ann_morphosyntax.xml  the morphosyntax annotation document
     Output: span (id, from, to, l) containing a TEI fs of type "lex" with the features
     lemma, pos, msd (only when non-empty) and join="left" (only when the morphosyntax seg
     has an "nps" feature).
     The lookup variables are typed as single nodes/strings, so a seg without exactly one
     matching morphosyntax entry is a run-time type error. -->
<xsl:template match="tei:seg" mode="morpho">
    <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
    <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
    <xsl:param name="index" as="xs:integer" required="no" select="1"/>
    <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>

    <!-- The lookup is deliberately spread over several variables so that each intermediate
         value can be inspected individually, should anything go wrong. -->
    <xsl:variable name="my_name" select="local-name()" as="xs:string"/>
    <xsl:variable name="my_id" select="@xml:id" as="xs:string"/>
    <!-- the morphosyntax seg whose @corresp points (after '#') at this seg's xml:id -->
    <xsl:variable name="my_morph-seg" as="node()"
        select="$ann_morphosyntax.xml//tei:seg[substring-after(@corresp,'#') eq $my_id]"/>
    <xsl:variable name="my_disamb" as="node()"
        select="$my_morph-seg//tei:fs/tei:f[@name eq 'disamb']"/>
    <!-- id of the chosen interpretation, taken from the 'choice' feature's @fVal -->
    <xsl:variable name="my_choice-id" as="xs:string"
        select="substring-after($my_disamb//tei:f[@name eq 'choice']/@fVal,'#')"/>
    <!-- the 'lex' feature structure that contains the chosen symbol -->
    <xsl:variable name="my_choice-lex" as="node()"
        select="$my_morph-seg//tei:f[@name eq 'interps']/tei:fs[@type eq 'lex'][descendant::tei:symbol[@xml:id eq $my_choice-id]]"/>
    <xsl:variable name="chosen-msd" as="xs:string"
        select="$my_choice-lex/descendant::tei:symbol[@xml:id eq $my_choice-id]/@value"/>

    <xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
    <xsl:variable name="preceding-count" select="count($preceding)"/>
    <!-- An "outside-preceding-count" term (as used by the generic tei:* template for s and p)
         used to be computed here; it was disabled once this template became seg-specific. -->
    <xsl:variable name="my_index" select="$index + 1 + $preceding-count" as="xs:integer"/>

    <!-- start: the integer between the first and second comma of @corresp -->
    <xsl:variable name="start" as="xs:integer">
        <xsl:variable name="numbers" select="substring-after(substring-before(@corresp,')'),',')"/>
        <xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
    </xsl:variable>
    <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

    <xsl:element name="span" namespace="{$KorAP_namespace}">
        <xsl:attribute name="id" select="'s' || $my_index"/>
        <xsl:attribute name="from" select="$start"/>
        <xsl:attribute name="to" select="$end"/>
        <xsl:attribute name="l" select="f:compute_nesting(.)"/>
        <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="type" select="'lex'"/>
            <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                <xsl:attribute name="name" select="'lex'"/>
                <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
                    <!-- the orthographic form is recorded as an XML comment only -->
                    <xsl:comment select="$my_morph-seg//tei:fs/tei:f[@name eq 'orth']/tei:string"/>
                    <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                        <xsl:attribute name="name" select="'lemma'"/>
                        <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'base']/tei:string"/>
                    </xsl:element>
                    <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                        <xsl:attribute name="name" select="'pos'"/>
                        <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'ctag']/tei:symbol/@value"/>
                    </xsl:element>
                    <xsl:if test="string-length($chosen-msd)">
                        <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                            <xsl:attribute name="name" select="'msd'"/>
                            <xsl:value-of select="$chosen-msd"/>
                        </xsl:element>
                    </xsl:if>
                    <xsl:if test="$my_morph-seg//tei:fs/tei:f[@name eq 'nps']">
                        <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                            <xsl:attribute name="name" select="'join'"/>
                            <xsl:value-of select="'left'"/>
                        </xsl:element>
                    </xsl:if>
                </xsl:element>
            </xsl:element>
        </xsl:element>
    </xsl:element>

    <!-- FIX: a stray comment terminator used to follow this instruction; being ordinary
         text inside the template body, it was copied into the result after every span. -->
    <xsl:apply-templates mode="morpho">
        <xsl:with-param name="ini" select="$start" as="xs:integer"/>
        <xsl:with-param name="fin" select="$end" as="xs:integer"/>
        <xsl:with-param name="index" select="$my_index"/>
        <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml" as="document-node()"/>
    </xsl:apply-templates>
</xsl:template>
banspe726b4a2022-03-28 05:47:45 +0200728
bansp5f841732022-03-16 06:27:31 +0100729 <!-- ************************** TEXT header ******************* -->
730
<!-- create_text_header: writes the text-level header.xml to $target.
     The children of every TEI/teiHeader in $text.xml are transformed in mode="text" inside
     an idsHeader wrapper; $compoundID travels as a tunnel parameter so that deeper templates
     (e.g. the one for titleStmt) can pick it up. -->
<xsl:template name="create_text_header">
    <xsl:param name="text.xml" as="document-node()"/>
    <xsl:param name="compoundID" as="xs:string"/>
    <xsl:param name="target" as="xs:string"/>

    <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
        xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
        <idsHeader type="document" pattern="text" status="new" version="1.1"
            TEIform="teiHeader">
            <xsl:apply-templates mode="text"
                select="$text.xml/descendant::tei:TEI/tei:teiHeader/tei:*">
                <xsl:with-param name="compoundID" tunnel="yes" as="xs:string"
                    select="$compoundID"/>
            </xsl:apply-templates>
        </idsHeader>
    </xsl:result-document>
</xsl:template>
747
<!-- text header: re-create fileDesc without a namespace and process its children in the same mode -->
<xsl:template match="tei:fileDesc" mode="text">
    <fileDesc>
        <xsl:apply-templates select="node()" mode="#current"/>
    </fileDesc>
</xsl:template>
753
<!-- text header: a title becomes t.title.
     NOTE(review): the children are processed in the default (unnamed) mode, whereas the
     corpus-level title template uses mode="header-text"; confirm that this is intended. -->
<xsl:template match="tei:title" mode="text">
    <xsl:element name="t.title">
        <xsl:apply-templates select="node()"/>
    </xsl:element>
</xsl:template>
759
<!-- text header: titleStmt starts with a textSigle holding the tunnelled $compoundID,
     followed by the transformed children -->
<xsl:template match="tei:titleStmt" mode="text">
    <xsl:param name="compoundID" as="xs:string" tunnel="yes"/>
    <xsl:element name="titleStmt">
        <xsl:element name="textSigle">
            <xsl:value-of select="$compoundID"/>
        </xsl:element>
        <xsl:apply-templates select="node()" mode="text"/>
    </xsl:element>
</xsl:template>
769
<!-- text header: re-create publicationStmt without a namespace and process its children in the same mode -->
<xsl:template match="tei:publicationStmt" mode="text">
    <publicationStmt>
        <xsl:apply-templates select="node()" mode="#current"/>
    </publicationStmt>
</xsl:template>
775
<!-- text header: availability keeps only what the mode="text" templates make of its
     attributes and child elements (text children are not selected) -->
<xsl:template match="tei:availability" mode="text">
    <availability>
        <xsl:apply-templates select="attribute::* | child::*" mode="#current"/>
    </availability>
</xsl:template>
781
<!-- text header: re-create profileDesc without a namespace and process its children in the same mode -->
<xsl:template match="tei:profileDesc" mode="text">
    <profileDesc>
        <xsl:apply-templates select="node()" mode="#current"/>
    </profileDesc>
</xsl:template>
bansp5f841732022-03-16 06:27:31 +0100787
<!-- text header: textClass keeps only what the mode="text" templates make of its
     attributes and child elements -->
<xsl:template match="tei:textClass" mode="text">
    <textClass>
        <xsl:apply-templates select="attribute::* | child::*" mode="#current"/>
    </textClass>
</xsl:template>
793
<!-- text and corpus headers: catRef is re-created without a namespace; its attributes and
     child elements are always processed in mode="text", whichever mode this template was
     invoked in -->
<xsl:template match="tei:catRef" mode="text corpus">
    <catRef>
        <xsl:apply-templates select="attribute::* | child::*" mode="text"/>
    </catRef>
</xsl:template>
799
<!-- text and corpus headers: the listed attributes are copied through unchanged;
     xml:id is copied only when it sits somewhere below a classDecl -->
<xsl:template mode="text corpus"
    match="@xml:lang | @xml:id[ancestor::tei:classDecl] | @type | @target | @scheme | @status">
    <xsl:copy/>
</xsl:template>
803
<!-- text and corpus headers: a paragraph is re-created without a namespace and its
     children are handed to the mode="header-text" templates -->
<xsl:template match="tei:p" mode="text corpus">
    <p>
        <xsl:apply-templates select="node()" mode="header-text"/>
    </p>
</xsl:template>
809
810
811 <!-- OPTIMIZATION has to take modes into account -->
bansp5e2d1c02022-03-10 04:51:40 +0100812 <!-- ************************** CORPUS header ******************* -->
<!-- create_corpus_header: writes the corpus-level header.xml to $target.
     The children of teiCorpus/teiHeader (teiCorpus being the document element of $text.xml)
     are transformed in mode="corpus" inside an idsHeader wrapper. -->
<xsl:template name="create_corpus_header">
    <xsl:param name="text.xml" as="document-node()"/>
    <xsl:param name="target" as="xs:string"/>

    <!-- doctype-public="{$publicDoctypeI5}" and doctype-system="{$systemDoctypeI5}"
         were tried on the result document and are, sadly, useless -->
    <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes">
        <idsHeader type="corpus" pattern="text" status="new" version="1.1"
            TEIform="teiHeader">
            <xsl:apply-templates mode="corpus"
                select="$text.xml/tei:teiCorpus/tei:teiHeader/tei:*"/>
        </idsHeader>
    </xsl:result-document>
</xsl:template>
830
<!-- corpus header: re-create fileDesc without a namespace and process its children in the same mode -->
<xsl:template match="tei:fileDesc" mode="corpus">
    <fileDesc>
        <xsl:apply-templates select="node()" mode="#current"/>
    </fileDesc>
</xsl:template>
bansp9103aab2022-03-19 05:10:21 +0100836
bansp5e2d1c02022-03-10 04:51:40 +0100837
<!-- corpus header: a title becomes c.title; its attributes go through the corpus-mode
     attribute templates, its children through mode="header-text" -->
<xsl:template match="tei:title" mode="corpus">
    <xsl:element name="c.title">
        <xsl:apply-templates select="attribute::*" mode="#current"/>
        <xsl:apply-templates select="node()" mode="header-text"/>
    </xsl:element>
</xsl:template>
844
<!-- corpus header: titleStmt starts with a korpusSigle holding the global $corpusID,
     followed by the transformed children -->
<xsl:template match="tei:titleStmt" mode="corpus">
    <xsl:element name="titleStmt">
        <xsl:element name="korpusSigle">
            <xsl:value-of select="$corpusID"/>
        </xsl:element>
        <xsl:apply-templates select="node()" mode="corpus"/>
    </xsl:element>
</xsl:template>
853
<!-- corpus header: re-create publicationStmt without a namespace and process its children in the same mode -->
<xsl:template match="tei:publicationStmt" mode="corpus">
    <publicationStmt>
        <xsl:apply-templates select="node()" mode="#current"/>
    </publicationStmt>
</xsl:template>
859
<!-- corpus header: availability keeps only what the mode="corpus" templates make of its
     attributes and child elements (text children are not selected) -->
<xsl:template match="tei:availability" mode="corpus">
    <availability>
        <xsl:apply-templates select="attribute::* | child::*" mode="#current"/>
    </availability>
</xsl:template>
865
<!-- corpus header: re-create encodingDesc without a namespace and process its children in the same mode -->
<xsl:template match="tei:encodingDesc" mode="corpus">
    <encodingDesc>
        <xsl:apply-templates select="node()" mode="#current"/>
    </encodingDesc>
</xsl:template>
871
<!-- corpus header: the classification scaffolding (classDecl, taxonomy, category and a bibl
     directly inside taxonomy) is rebuilt under the same local name without a namespace;
     only attributes and child elements are processed further -->
<xsl:template mode="corpus"
    match="tei:classDecl | tei:taxonomy | tei:category | tei:taxonomy/tei:bibl">
    <xsl:element name="{local-name(.)}">
        <xsl:apply-templates select="attribute::* | child::*" mode="#current"/>
    </xsl:element>
</xsl:template>
877
<!-- corpus header: text-bearing leaves (a title inside bibl, edition, desc) are rebuilt
     under the same local name; attributes stay in corpus mode, the content is handed to
     mode="header-text" -->
<xsl:template mode="corpus" match="tei:bibl/tei:title | tei:edition | tei:desc">
    <xsl:element name="{local-name(.)}">
        <xsl:apply-templates select="attribute::*" mode="#current"/>
        <xsl:apply-templates select="node()" mode="header-text"/>
    </xsl:element>
</xsl:template>
883 </xsl:template>
884<!--
885 <xsl:template match="tei:textClass" mode="corpus">
886 <xsl:element name="{local-name()}">
887 <xsl:apply-templates mode="corpus" select="@* | *"/>
888 </xsl:element>
889 </xsl:template>
890
891 <xsl:template match="tei:catRef" mode="corpus">
892 <xsl:element name="{local-name()}">
893 <xsl:apply-templates mode="corpus" select="@* | *"/>
894 </xsl:element>
895 </xsl:template>
896-->
bansp5e2d1c02022-03-10 04:51:40 +0100897
898
899
    <!-- this template can be called by the XSpec test; TODO: find a way to call the main() template directly -->
    <!-- I have not fully handled the param transmission, which would have to be kludged in just for the sake of XSpec,
         because I'm disabling this for now, due to XSpec design issues; relevant links, a.o.:

         https://stackoverflow.com/questions/64933277/what-is-the-cause-of-error-cannot-execute-xslresult-document-while-evaluating
         https://www.balisage.net/Proceedings/vol25/html/Galtman01/BalisageVol25-Galtman01.html

         In short: the internal design of XSpec forces kludges when one wants to use xsl:result-document in their stylesheets. But I don't
         want to be strangled by kludges at the beginning of work, and I've already lost quite a bit of time on this investigation.
         I will therefore "just code" and can then think of externalizing bits of templates if we want to play with tests. For now,
         I don't want to have to handle context items in a special way inside variables, etc., because I'm not sure it's worth it.

    -->
913 <!--<xsl:template name="test_full">
914 <xsl:param name="corpusID"/>
915 <xsl:param name="docID"/>
916 <xsl:param name="textID"/>
917 <xsl:call-template name="xsl:initial-template"/>
918 </xsl:template>-->
919
<!-- f:calc_content_length: end offset of $node, read from a @corresp attribute.
     The attribute value is expected to contain ",OFFSET,LENGTH)" after its first comma
     (presumably a string-range pointer; confirm against the NKJP data); the function
     returns OFFSET + LENGTH.
     Which @corresp is used:
       text / body  the last non-rejected seg of the last s of the last p
       p            the last non-rejected seg of the last s
       s            the last non-rejected seg
       otherwise    the node's own @corresp
     A seg counts as rejected when its nkjp:rejected attribute equals 'true'.
     Errors: for text, body, p and s the absence of such a seg is fatal (exactly-one);
     a @corresp that is missing or does not hold two integers makes the integer cast fail. -->
<xsl:function name="f:calc_content_length" as="xs:integer">
    <xsl:param name="node" as="node()"/>

    <xsl:variable name="corresp" as="attribute(corresp)?" select="
        if ($node/self::tei:text or $node/self::tei:body) then
            exactly-one($node/descendant::tei:p[last()]/descendant::tei:s[last()]/descendant::tei:seg[not(@nkjp:rejected = 'true')][last()]/attribute::corresp)
        else if ($node/self::tei:p) then
            exactly-one($node/descendant::tei:s[last()]/descendant::tei:seg[not(@nkjp:rejected = 'true')][last()]/attribute::corresp)
        else if ($node/self::tei:s) then
            exactly-one($node/descendant::tei:seg[not(@nkjp:rejected = 'true')][last()]/attribute::corresp)
        else
            $node/@corresp"/>

    <!-- "OFFSET,LENGTH": everything between the first comma and the first ')' -->
    <xsl:variable name="numbers" as="xs:string"
        select="substring-after(substring-before($corresp, ')'), ',')"/>

    <!-- the debug message formerly printed for rejected segs has been dropped;
         it was marked for removal -->
    <xsl:sequence
        select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"/>
</xsl:function>
954
<!-- f:calc_offsets: returns the pair (start, end) for $node, where
     end = start + length - 1 + (1 if $skip_start is true, else 0).
     $OFFSET_START and $OFFSET_END are defined elsewhere in the stylesheet and are used to
     index into that pair (presumably 1 and 2; confirm).

     start
       0            when $skip_start is true, or for text and body
       p            0 if body has no earlier p child; otherwise the end value that a
                    recursive call with skip_start = true() returns for the p just before
       s            (end value of the s just before it inside the same p, computed with
                    skip_start = true(), or 0 if there is none) + start of the enclosing p
       seg          first integer of its own @corresp + start of the enclosing p
       Other elements have no branch, so start is undefined for them unless $skip_start is set.

     length
       text / body  sum over all p of OFFSET + LENGTH of the last non-rejected seg in its last s
       p            OFFSET + LENGTH of the last non-rejected seg in its last s
       s            OFFSET + LENGTH of its last non-rejected seg
       otherwise    OFFSET + LENGTH of the node's own @corresp

     The function also emits diagnostic messages (one per call, plus extra ones for segs
     that carry an nkjp:rejected attribute). -->
<xsl:function name="f:calc_offsets" as="xs:integer+">
    <xsl:param name="node" as="element()"/>
    <xsl:param name="skip_start" as="xs:boolean"/>

    <xsl:variable name="start" as="xs:integer">
        <xsl:choose>

            <xsl:when test="$skip_start or $node/self::tei:text or $node/self::tei:body">
                <xsl:sequence select="0"/>
            </xsl:when>

            <!-- p -->
            <xsl:when test="$node/self::tei:p">
                <xsl:variable name="own_position" as="xs:integer"
                    select="1 + count($node/preceding-sibling::tei:p)"/>
                <xsl:variable name="earlier_paragraphs" as="node()*"
                    select="$node/ancestor::tei:body/tei:p[position() lt $own_position]"/>
                <!-- NOTE: the second argument of sum() is only the value returned for an empty
                     input sequence; it does NOT add 1 to the result. Whether a separator
                     offset is needed between paragraphs is still an open question.
                     A non-recursive variant (summing the end offsets of the last segs of all
                     earlier paragraphs) was sketched earlier and may be cheaper to evaluate;
                     it was not kept in sync with this recursive form. -->
                <xsl:sequence select="
                    if (empty($earlier_paragraphs)) then
                        0
                    else
                        sum(f:calc_offsets($earlier_paragraphs[last()], true())[$OFFSET_END], 1)"/>
            </xsl:when>

            <!-- s: counted from the start of the current p, then shifted by that p's own start -->
            <xsl:when test="$node/self::tei:s">
                <xsl:variable name="own_position" as="xs:integer"
                    select="1 + count($node/preceding-sibling::tei:s)"/>
                <xsl:variable name="earlier_sentences" as="node()*"
                    select="$node/ancestor::tei:p[1]/tei:s[position() lt $own_position]"/>
                <!-- careful: no +1 is applied between sentences; this may need revisiting -->
                <xsl:variable name="internal_start" as="xs:integer" select="
                    if (empty($earlier_sentences)) then
                        0
                    else
                        f:calc_offsets($earlier_sentences[last()], true())[$OFFSET_END]"/>
                <xsl:variable name="external_start" as="xs:integer"
                    select="f:calc_offsets($node/ancestor::tei:p[1], false())[$OFFSET_START]"/>
                <xsl:sequence select="$internal_start + $external_start"/>
            </xsl:when>

            <!-- seg: the s level is irrelevant, the local offset sits directly on @corresp -->
            <xsl:when test="$node/self::tei:seg">
                <xsl:variable name="numbers"
                    select="substring-after(substring-before($node/@corresp, ')'), ',')"/>
                <xsl:variable name="internal_start" as="xs:integer"
                    select="xs:integer(substring-before($numbers, ','))"/>
                <xsl:variable name="external_start" as="xs:integer"
                    select="f:calc_offsets($node/ancestor::tei:p[1], false())[$OFFSET_START]"/>
                <xsl:if test="exists($node/@nkjp:rejected)">
                    <xsl:message select="'numbers: ' || $numbers"/>
                </xsl:if>
                <xsl:sequence select="$internal_start + $external_start"/>
            </xsl:when>

        </xsl:choose>
    </xsl:variable>

    <xsl:variable name="length" as="xs:integer">
        <xsl:choose>

            <!-- text, body and p: add up OFFSET + LENGTH of the closing seg of each paragraph
                 in scope (for p that is a single value) -->
            <xsl:when test="$node/self::tei:text or $node/self::tei:body or $node/self::tei:p">
                <xsl:variable name="last_corresps" as="attribute(corresp)+" select="
                    if ($node/self::tei:p) then
                        $node/descendant::tei:s[last()]/(descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'])[last()]/@corresp
                    else
                        $node/descendant::tei:p/descendant::tei:s[last()]/(descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'])[last()]/@corresp"/>
                <xsl:variable name="end_offsets" as="xs:integer+" select="
                    for $numbers in $last_corresps ! substring-after(substring-before(., ')'), ',')
                    return
                        xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"/>
                <xsl:sequence select="sum($end_offsets)"/>
            </xsl:when>

            <xsl:when test="$node/self::tei:s">
                <xsl:variable name="last_corresp" as="attribute(corresp)"
                    select="$node/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/attribute::corresp"/>
                <xsl:variable name="numbers"
                    select="substring-after(substring-before($last_corresp, ')'), ',')"/>
                <xsl:sequence
                    select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"/>
            </xsl:when>

            <xsl:otherwise>
                <xsl:variable name="numbers"
                    select="substring-after(substring-before($node/@corresp, ')'), ',')"/>
                <!-- diagnostic output for rejected segs; marked for removal -->
                <xsl:if test="$node/self::tei:seg and exists($node/@nkjp:rejected)">
                    <xsl:message select="'rejected: ' || $numbers"/>
                </xsl:if>
                <xsl:sequence
                    select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"/>
            </xsl:otherwise>

        </xsl:choose>
    </xsl:variable>

    <!-- per-call trace: element name, 1-based position among same-named siblings, length, flag -->
    <xsl:message select="local-name($node) || '[' || count($node/preceding-sibling::*[local-name() eq local-name($node)])+1 || '] length: ' || $length || ' skip_start: ' || $skip_start"/>

    <xsl:sequence select="($start, $start + $length - 1 + xs:integer($skip_start))"/>
</xsl:function>
1136
1137
Akron9a8ee3e2022-01-31 13:51:49 +01001138</xsl:stylesheet>
Piotr Banski6a4a2522022-05-24 01:16:47 +02001139
1140<!--<xsl:message select="('map:',serialize($map, map{'method':'adaptive'}))"/>-->