blob: 126752dcdbe158b3a773733f06976bbb26300a59 [file] [log] [blame]
Akron9a8ee3e2022-01-31 13:51:49 +01001<?xml version="1.0" encoding="UTF-8"?>
2<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
bansp5e2d1c02022-03-10 04:51:40 +01003 xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"
4 xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:f="func"
Piotr Banskif8af3a92022-05-23 03:20:10 +02005 xmlns:fn="http://www.w3.org/2005/xpath-functions"
Piotr Banski6a4a2522022-05-24 01:16:47 +02006 xmlns:map="http://www.w3.org/2005/xpath-functions/map" exclude-result-prefixes="xs f fn map nkjp tei"
bansp5e2d1c02022-03-10 04:51:40 +01007 version="3.0" expand-text="yes">
Akron9a8ee3e2022-01-31 13:51:49 +01008
banspe726b4a2022-03-28 05:47:45 +02009
10<!-- PARAMETERS -->
bansp5e2d1c02022-03-10 04:51:40 +010011
<xsl:param name="sourceDir" as="xs:string" select="'test/resources/nkjp2korap_sample2'"/>
<!-- The directory containing the NKJP files, in the form of a collection of
     text-level directories (that is how we know both the $corpusID and the $docID). -->

<xsl:param name="targetDir" as="xs:string" select="'test/output'"/>
<!-- Where the corpus/document/text/annotations hierarchy is going to be created. -->

<xsl:param name="skip_docID" as="xs:string"
  select="'HellerPodgladanie,IsakowiczZaleskiMoje,KolakowskiOco,MysliwskiKamien,WilkWilczy,ZycieWarszawy_Zycie'"/>
<!-- Comma-separated list of document IDs to be skipped from processing.
       example: HellerPodgladanie,KOT
     Only plain string identity is supported (this is just for testing). -->
banspb5992532022-03-29 15:55:44 +020026
bansp8f6700b2022-03-27 05:27:09 +020027
bansp9dc10002022-05-17 22:33:34 +020028<!-- VARIABLES (= constants...) -->
banspe726b4a2022-03-28 05:47:45 +020029
<!-- Identifiers used to build output paths and the compound document ID. -->
<xsl:variable name="corpusID" static="yes" as="xs:string" select="'NKJP'"/>
<xsl:variable name="docID" static="yes" as="xs:string" select="'NKJP'"/>

<!-- Root of the generated corpus directory, with a trailing slash. -->
<xsl:variable name="targetCorpusDir_slashed" as="xs:string"
  select="$targetDir || '/' || $corpusID || '/'"/>

<!-- System and public identifiers of the I5 DTD. -->
<xsl:variable name="systemDoctypeI5" static="yes" as="xs:string"
  select="'http://corpora.ids-mannheim.de/I5/DTD/i5.dtd'"/>

<xsl:variable name="publicDoctypeI5" static="yes" as="xs:string"
  select="'-//IDS//DTD I5 1.0//EN'"/>

<!-- Namespace of the generated KorAP-XML elements. -->
<xsl:variable name="KorAP_namespace" static="yes" as="xs:string"
  select="'http://ids-mannheim.de/ns/KorAP'"/>

<xsl:variable name="KorAP-XML_version" static="yes" as="xs:string" select="'KorAP-0.4'"/>
<!-- this is only a bit funny -->

<!-- Query parameters appended to $sourceDir when the collection is loaded;
     see https://www.saxonica.com/documentation11/index.html#!sourcedocs/collections/collection-directories -->
<xsl:variable name="collection_params" static="yes" as="xs:string"
  select="'recurse=yes;validation=strip;select=text.xml;content-type=application/xml;on-error=warning;xinclude=yes'"/>

<!-- Every text.xml document found under $sourceDir. -->
<xsl:variable name="collection_of_text" as="document-node()+"
  select="collection($sourceDir || '?' || $collection_params)"/>

<!-- These two 'flags' are meant to increase the readability of the code:
     they index the (start, end) sequence returned by the calc_offsets() function.

     Remove together with the function!
-->
<xsl:variable name="OFFSET_START" static="yes" as="xs:integer" select="1"/>
<xsl:variable name="OFFSET_END" static="yes" as="xs:integer" select="2"/>
banspb5992532022-03-29 15:55:44 +020063
64
banspe726b4a2022-03-28 05:47:45 +020065<!-- MODES -->
bansp5e2d1c02022-03-10 04:51:40 +010066
<xsl:mode name="corpus" on-no-match="deep-skip"/>
<xsl:mode name="text" on-no-match="deep-skip"/>
<xsl:mode name="header-text" on-no-match="text-only-copy"/>
<xsl:mode use-accumulators="#all"/>

<!-- Accumulator 'elem-offset-seq'

     Collects one map per accepted word: a tei:w whose parent tei:seg carries
     no @nkjp:rejected attribute. The value is a sequence that starts with a
     single empty map (the initial value) followed by one entry per word:

       'id'    : @xml:id of the parent tei:seg, as a string
       'start' : end offset of the previous entry, plus 1 when
                 f:is_preceded_by_ws() reports whitespace before the segment
       'end'   : 'start' plus the string length of the word

     Consumers are expected to drop the leading empty map with tail().

     Fix: 'end' used to hold only the length of the word, although the next
     rule firing reads it back as the previous end offset; every offset after
     the first one was therefore wrong. 'end' is now cumulative. -->
<xsl:accumulator name="elem-offset-seq" as="map(xs:string, item())+" initial-value="(map{})">
  <xsl:accumulator-rule match="tei:w[parent::tei:seg[count(@nkjp:rejected) eq 0]]" phase="end">
    <!-- Only the initial empty map is present: nothing precedes this word. -->
    <xsl:variable name="previous_end" as="xs:integer"
      select="
        if (count($value) eq 1) then
          0
        else
          map:get($value[last()], 'end')"/>

    <!-- A boolean cast to xs:integer yields 1 for true and 0 for false. -->
    <xsl:variable name="start" as="xs:integer"
      select="$previous_end + xs:integer(f:is_preceded_by_ws(parent::tei:seg))"/>

    <xsl:sequence select="
        $value,
        map {
          'id': string(parent::tei:seg/@xml:id),
          'start': $start,
          'end': $start + string-length()
        }"/>
  </xsl:accumulator-rule>
</xsl:accumulator>
bansp5e2d1c02022-03-10 04:51:40 +010096
banspe726b4a2022-03-28 05:47:45 +020097
98 <!-- FUNCTIONS -->
99
<!-- Returns the nesting level of $node: the number of elements on its
     ancestor-or-self axis whose local name is neither 'TEI' nor 'teiCorpus'. -->
<xsl:function name="f:compute_nesting" as="xs:integer">
  <xsl:param name="node" as="element()"/>
  <xsl:sequence
    select="count($node/ancestor-or-self::*[not(local-name(.) = ('TEI', 'teiCorpus'))])"/>
</xsl:function>
107
<!-- Tells whether whitespace is to be assumed in front of $node.

     $node must be a seg, s or p element (tested by local name):
       seg : true unless the segment carries @nkjp:nps, or it is the first seg
             among its siblings, inside the first s, inside the first p
       s   : true when a preceding sibling s exists, or when the nearest
             ancestor p has a preceding sibling p
       p   : true when a preceding sibling p exists
     Any other element terminates the transformation with a message.

     Fix: the 's' and 'p' branches used to emit debugging messages built with
     '||' over a node sequence. That operator accepts at most one item per
     operand, so the call failed with a type error as soon as there were two
     or more preceding siblings. The messages have been removed. -->
<xsl:function name="f:is_preceded_by_ws" as="xs:boolean">
  <xsl:param name="node" as="element()"/>
  <xsl:choose>
    <xsl:when test="local-name($node) eq 'seg'">
      <xsl:sequence
        select="not(exists($node/@nkjp:nps)) and not($node[count(preceding-sibling::tei:seg) eq 0]/ancestor::tei:s[count(preceding-sibling::tei:s) eq 0]/ancestor::tei:p[count(preceding-sibling::tei:p) eq 0])"
      />
      <!-- earlier variant of the second condition, kept for reference:
           and not($node/ancestor::tei:s[count(preceding-sibling::tei:s) eq 0]/ancestor::tei:p[count(preceding-sibling::tei:p) eq 0]) -->
    </xsl:when>
    <xsl:when test="local-name($node) eq 's'">
      <xsl:sequence select="exists($node/preceding-sibling::tei:s) or exists($node/ancestor::tei:p[1]/preceding-sibling::tei:p)"/>
    </xsl:when>
    <xsl:when test="local-name($node) eq 'p'">
      <xsl:sequence select="exists($node/preceding-sibling::tei:p)"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:message terminate="yes" select="'Wrong argument passed to f:is_preceded_by_ws(): ' || local-name($node) || ' Only p, s, seg are allowed.'"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>
banspd1bf1db2022-04-04 02:16:24 +0200133
banspb5992532022-03-29 15:55:44 +0200134
135<!-- UTILITY TEMPLATES -->
136
<xsl:template match="@default" mode="#all"/>
<!-- this is to delete some auto-inserted attribute throughout -->

<xsl:template match="tei:w" mode="#all"/>
<!-- NKJP-SGJP has apparently resigned from standoff representations by adding <w> everywhere;
     for the time being, we'll just stick to the standoff offsets, although that may need to
     be revisited as the NKJP format has now begun to stray from its schemas and assumptions -->

<!--<xsl:template match="tei:choice" mode="#all"/>-->
<!-- THIS IS ONLY TEMPORARY,
     because an interesting challenge came up where I will
     probably have to abandon straightforward mapping because of TOKENIZATION alternatives;

     but now, I just want this stylesheet to work, even if it eats some occasional token
     (which it now does, 'komuś' and 'czym' vanish)
-->

<!-- Fall-through for tei:choice in mode 'struct': the wrapper itself produces
     no span; its descendant tei:seg elements are processed instead.

     Fix: the apply-templates instruction had no mode attribute, so the
     segments were processed in the unnamed mode, where no template in this
     stylesheet builds spans for them, and they were silently dropped.
     mode="#current" keeps them in 'struct'. The positional parameters are
     forwarded unchanged so that the segments see what their enclosing element
     passed down.

     NOTE(review): every descendant seg is selected, including those carrying
     @nkjp:rejected, and span ids derived from $index may coincide with those
     of segments outside the choice. Confirm the intended handling. -->
<xsl:template match="tei:choice" mode="struct">
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>

  <xsl:apply-templates select="descendant::tei:seg" mode="#current">
    <xsl:with-param name="ini" select="$ini" as="xs:integer"/>
    <xsl:with-param name="fin" select="$fin" as="xs:integer"/>
    <xsl:with-param name="index" select="$index" as="xs:integer"/>
  </xsl:apply-templates>
</xsl:template>
banspb5992532022-03-29 15:55:44 +0200157
158 <!-- MAIN PROCESSING -->
159
160
<!-- Entry point. Writes the corpus header once, then converts every text.xml
     of the collection whose directory name is not listed in $skip_docID. -->
<xsl:template name="xsl:initial-template">
  <xsl:variable name="IDs_to_skip" as="xs:string*" select="tokenize($skip_docID,',')"/>

  <!-- We only want to call the template below once, and we process a random NKJP corpus
       file for that purpose, because all we need is the main corpus header, and we can
       (should) get to that from any NKJP corpus document. -->
  <xsl:call-template name="create_corpus_header">
    <xsl:with-param name="target" as="xs:string" select="$targetCorpusDir_slashed || 'header.xml'"/>
    <xsl:with-param name="text.xml" as="document-node()" select="$collection_of_text[1]"/>
  </xsl:call-template>

  <xsl:for-each select="$collection_of_text">
    <!-- Directory of the current text, and its last path step used as the text ID. -->
    <xsl:variable name="text_dir" as="xs:string" select="replace(base-uri(),'/text\.xml','')"/>
    <xsl:variable name="text_id" as="xs:string" select="tokenize($text_dir,'/')[last()]"/>

    <!-- Skipping is a utility step, for when we want to ignore some texts for any
         reason (debugging, selective update). -->
    <xsl:if test="not($text_id = $IDs_to_skip)">

      <!--<xsl:message select="f:calc_offsets(doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[2]/tei:seg[1],false())"/>-->

      <!-- <xsl:message select="doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[1]/tei:seg[1] || f:is_preceded_by_ws(doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[1]/tei:seg[1])"/>
           <xsl:message select="doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[1]/tei:seg[3] || f:is_preceded_by_ws(doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[1]/tei:seg[3])"/>
      -->

      <xsl:call-template name="process_single_sample">
        <xsl:with-param name="my_textID" as="xs:string" select="$text_id"/>
        <xsl:with-param name="text.xml" as="document-node()" select="."/>
        <xsl:with-param name="ann_segmentation.xml" as="document-node()"
          select="doc($text_dir || '/ann_segmentation.xml')"/>
        <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
          select="doc($text_dir || '/ann_morphosyntax.xml')"/>
      </xsl:call-template>
    </xsl:if>
  </xsl:for-each>
</xsl:template>
200
<!-- Converts one NKJP text. Derives the output directory and the compound
     document ID from $my_textID, then calls create_data and create_struct.
     The calls to create_morpho and create_text_header are currently disabled,
     so $text.xml and $ann_morphosyntax.xml are accepted but not read here. -->
<xsl:template name="process_single_sample">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="my_textID" select="'0-BAD_textID'" as="xs:string"/>
  <!-- empty textID should never happen, but if it does, it will be signalled at the top of the output -->

  <xsl:variable name="targetBaseDir" select="$targetCorpusDir_slashed || $docID || '/' || $my_textID"
    as="xs:string"/>

  <!-- this is what occurs in the text and data layers as @docid -->
  <xsl:variable name="compoundID" select="$corpusID || '_' || $docID || '.' || $my_textID"
    as="xs:string"/>

  <!-- This is space devoted to recalculating word offsets on the basis of ann_segmentation.xml
       (rather than text.xml). The results should be available to all annotation files, so we
       prepare a map here and send it off to whichever annotation layer needs it. -->

  <!-- This is done in several steps, because I wanted to be able to look stuff up. There should
       probably be some idiomatic way to reduce the memory footprint of these variables - I'll be
       happy to learn about it. -->

  <!--<xsl:variable name="map_w" as="map(xs:untypedAtomic,item()+)">
    <xsl:variable name="segs" select="$ann_segmentation.xml/tei:teiCorpus/tei:TEI/tei:text/tei:body/tei:p/tei:s//tei:seg[count(@nkjp:rejected) eq 0]" as="element()+"/>
    <xsl:map>
      <xsl:for-each-group select="$segs" group-by="ancestor::tei:p[1]/@xml:id">
        <xsl:variable name="current-p" select="current-grouping-key()"/>
        <xsl:for-each-group select="current-group()" group-by="ancestor::tei:s[1]/@xml:id">
          <xsl:variable name="current-s" select="current-grouping-key()"/>
          <xsl:for-each select="current-group()">
            <xsl:map-entry key="@xml:id" select="$current-p, $current-s, position(), f:is_preceded_by_ws(.), normalize-space(tei:w)"/>
          </xsl:for-each>
        </xsl:for-each-group>
      </xsl:for-each-group>
    </xsl:map>
  </xsl:variable>-->

  <!--<xsl:variable name="map_p-s-w" as="map(xs:untypedAtomic,item()+)">
    <xsl:map>
      <xsl:for-each-group select="map:keys($map_w)" group-by="map:get($map_w, .)[1]">
        <xsl:sort select="xs:integer(substring-before(substring-after(., 'segm_'), '.'))"
          order="ascending"/>
        <xsl:sort select="xs:integer(substring-before(substring-after(., '.'), '-'))"
          order="ascending"/>
        <xsl:variable name="current-p-pos" select="fn:position()" as="xs:integer"/>
        <!-\- the above is used in the sentence loop, when we check if it's text-initial -\->
        <xsl:variable name="current-p" select="fn:current-grouping-key()" as="xs:string"/> <!-\-xs:untypedAtomic-\->
        <xsl:variable name="p-length" select="
            sum(for $id in current-group()
            return
              string-length(map:get($map_w, $id)[5])) + count(current-group()) - xs:integer(position() ne 1) -
            count(fn:filter(current-group(), function ($w-id) {
              map:get($map_w, $w-id)[4] eq false()
            }))"/>
        <!-\- The general algorithm is:
              * count and sum the lengths of all the words
              * add 'whitespace' for all of them (= count them and add that), and then
              * subtract whitespace for those of them that are not actually preceded by it
              and if the 1st word is_preceded_by_ws then subtract 1
              because identifying that 1st word would require an extra step, we're taking a shortcut via position() -
              and that strongly depends on the presence of the xsl:sort instructions -\->

        <xsl:message select="'sum: ' || sum( for $id in current-group() return string-length(map:get($map_w, $id)[5]) )"/>
        <!-\-<xsl:message select="for $id in current-group() return (string-length(map:get($map_w, $id)[5]),map:get($map_w, $id)[4] )"/>-\->
        <xsl:message select="'cur-group count: ' || count(fn:current-group())"/>
        <!-\-<xsl:message select="fn:for-each(current-group(), function($w-id) { map:get($map_w,$w-id)[4] eq false() } )"></xsl:message>-\->
        <xsl:message select="'subtract:' || count(fn:filter(current-group(), function($w-id) { map:get($map_w,$w-id)[4] eq false() } ))"></xsl:message>
        <xsl:message select="'position: ' || position() || ', xs:integer(position() ne 1)=' || xs:integer(position() ne 1)"></xsl:message>
        <xsl:message select="'p-length: ' || $p-length"/>

        <xsl:map-entry key="current-grouping-key()" select="'p', position(), $p-length"/>

        <xsl:message select="'p: ', $current-p || ' pos:' || position(), current-group()"/>

        <xsl:for-each-group select="current-group()" group-by="map:get($map_w, .)[2]">
          <xsl:sort select="xs:integer(substring-before(substring-after(., 'segm_'), '.'))"
            order="ascending"/>
          <xsl:sort select="xs:integer(substring-before(substring-after(., '.'), '-'))"
            order="ascending"/>
          <xsl:variable name="current-s" select="fn:current-grouping-key()" as="xs:string"/> <!-\-xs:untypedAtomic-\->
          <xsl:variable name="s-length" select="
              sum(for $id in current-group()
              return
                string-length(map:get($map_w, $id)[5])) + count(current-group()) - xs:integer($current-p-pos ne 1) -
              count(fn:filter(current-group(), function ($w-id) {
                map:get($map_w, $w-id)[4] eq false()
              }))"/>

          <xsl:map-entry key="current-grouping-key()" select="'s', position(), $s-length, $current-p"/>

          <xsl:message select="'s: ', position(), current-group()"/>

          <xsl:for-each select="current-group()">
            <xsl:sort select="map:get($map_w, .)[3]" order="ascending"/>
            <xsl:map-entry key="." select="'w', position(), string-length(map:get($map_w, .)[5]), $current-s, map:get($map_w, .)[4]"/>
            <!-\- <xsl:message select="map:get($map_w, .)[5]"/>-\->
          </xsl:for-each>
        </xsl:for-each-group>
      </xsl:for-each-group>
    </xsl:map>
  </xsl:variable>-->

  <!--<xsl:variable name="offsets-p" as="map(xs:untypedAtomic,item()+)">
    <xsl:map>
      <xsl:for-each select="
          fn:filter(map:keys($map_p-s-w), function ($ele) {
            map:get($map_p-s-w, $ele)[1] eq 'p'
          })">
        <xsl:sort select="map:get($map_p-s-w, .)[2]"/>

        <xsl:map-entry key="." select="map:get($map_p-s-w, .)[2]"/>

      </xsl:for-each>
    </xsl:map>
  </xsl:variable>-->

  <!-- <xsl:message select="'map_w size: ' || map:size($map_w)"/>
       <xsl:message select="'map_s-p size: ' || map:size($map_p-s-w)"/>
       <xsl:message select="'offsets size: ' || map:size($offsets-p)"/>-->

  <!-- data.xml: the raw text, rebuilt from the segmentation layer -->
  <xsl:call-template name="create_data">
    <!--<xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>-->
    <xsl:with-param name="compoundID" as="xs:string" select="$compoundID"/>
    <xsl:with-param name="target" as="xs:string" select="$targetBaseDir || '/data.xml'"/>
    <xsl:with-param name="ann_segmentation.xml" as="document-node()" select="$ann_segmentation.xml"/>
  </xsl:call-template>

  <!-- struct/structure.xml: the structural spans -->
  <xsl:call-template name="create_struct">
    <xsl:with-param name="target" as="xs:string" select="$targetBaseDir || '/struct/structure.xml'"/>
    <xsl:with-param name="compoundID" as="xs:string" select="$compoundID"/>
    <xsl:with-param name="ann_segmentation.xml" as="document-node()" select="$ann_segmentation.xml"/>
    <!--<xsl:with-param name="offsets" select="$offsets" as="map(xs:string, xs:integer+)"/>-->
  </xsl:call-template>

  <!-- <xsl:call-template name="create_morpho">
         <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
         <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
         <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml"
           as="document-node()"/>
         <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml"
           as="document-node()"/>
         <xsl:with-param name="target" select="$targetBaseDir || '/nkjp/morpho.xml'" as="xs:string"/>
       </xsl:call-template>
  -->
  <!--<xsl:call-template name="create_text_header">
        <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
        <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
        <xsl:with-param name="target" select="$targetBaseDir || '/header.xml'" as="xs:string"/>
      </xsl:call-template>-->

  <!--<xsl:message select="'size of offsets in process_single: ' || map:size($offsets)"/>-->

</xsl:template>
360
361 <!-- ************************** data.xml ******************* -->
362
<!-- Writes the data.xml file of one text to $target.

     Parameters:
       ann_segmentation.xml : the segmentation layer of the text
       compoundID           : value of the @docid attribute on the root element
       target               : href of the result document

     The raw text is rebuilt from the accepted segments (tei:seg without
     @nkjp:rejected) found under teiCorpus/TEI/text/body/p/s: each segment
     contributes its tei:w content, preceded by a single space whenever
     f:is_preceded_by_ws() says so.

     Fixes:
       * xpath-default-namespace is not an attribute value template, so
         "{$KorAP_namespace}" was taken literally rather than expanded. It now
         carries the same literal URI as in create_struct and create_morpho.
         No XPath expression in this template uses an unprefixed element name,
         so selection is unaffected.
       * $content was typed xs:string+, which raised a type error for a text
         without any accepted segment. It is now xs:string*, and such a text
         yields an empty text element. -->
<xsl:template name="create_data">
  <!--<xsl:param name="text.xml" as="document-node()"/>-->
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="target" as="xs:string"/>

  <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">

    <xsl:processing-instruction name="xml-model">href=&quot;text.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
    <xsl:element name="raw_text" namespace="{$KorAP_namespace}">
      <xsl:attribute name="docid" select="$compoundID"/>
      <xsl:element name="metadata" namespace="{$KorAP_namespace}">
        <xsl:attribute name="file" select="'metadata.xml'"/>
      </xsl:element>

      <xsl:element name="text" namespace="{$KorAP_namespace}">
        <!-- One separator string (' ' or '') and the word content per accepted segment. -->
        <xsl:variable name="content" as="xs:string*">
          <xsl:for-each select="$ann_segmentation.xml/tei:teiCorpus/tei:TEI/tei:text/tei:body/tei:p/tei:s//tei:seg[count(@nkjp:rejected) eq 0]">
            <xsl:sequence select="
                if (f:is_preceded_by_ws(.)) then
                  ' '
                else
                  '', ./tei:w"/>
          </xsl:for-each>
        </xsl:variable>
        <xsl:value-of select="string-join($content)"/>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
394
bansp5f841732022-03-16 06:27:31 +0100395 <!-- ************************** struct ******************* -->
396
<!-- Writes the structural layer of one text to $target: a KorAP 'layer'
     element holding a 'spanList', filled by applying the 'struct' mode
     templates to every tei:text of $ann_segmentation.xml.

     Parameters:
       compoundID           : value of the @docid attribute on the layer
       ann_segmentation.xml : the segmentation layer of the text
       target               : href of the result document -->
<xsl:template name="create_struct">
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>
  <!--<xsl:param name="offsets" as="map(xs:string, xs:integer+)"/>-->

  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <xsl:processing-instruction name="xml-model">href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
    <xsl:element namespace="{$KorAP_namespace}" name="layer">
      <xsl:attribute name="docid">
        <xsl:value-of select="$compoundID"/>
      </xsl:attribute>
      <xsl:attribute name="version">
        <xsl:value-of select="$KorAP-XML_version"/>
      </xsl:attribute>

      <!--<xsl:message select="'size of offsets in create_struct: ' || map:size($offsets)"/>-->

      <xsl:element namespace="{$KorAP_namespace}" name="spanList">
        <xsl:apply-templates mode="struct" select="$ann_segmentation.xml//tei:text">
          <!--<xsl:with-param name="offsets" as="map(xs:string, xs:integer+)" tunnel="yes">
            <xsl:map>
              <xsl:for-each select="tail(fn:accumulator-after('elem-offset-seq'))">
                <xsl:map-entry key="map:get(., 'id')" select="map:get(., 'start'), map:get(., 'end')"/>
              </xsl:for-each>
            </xsl:map>
          </xsl:with-param>-->
        </xsl:apply-templates>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
426
<!-- Emits one KorAP 'span' for the current TEI element, then processes its
     children in the same mode.

     Parameters:
       ini, fin : received and handed on to the children as the parent's
                  $start and $end; not otherwise read in this template
       index    : index handed down by the parent, the basis of the span id
       offsets  : (tunnel) map from a segment's @xml:id to its (start, end)
                  pair, built from the accumulator 'elem-offset-seq'

     Span attributes:
       id          : 's' followed by $my_index
       from        : first number in the string-range of the first relevant
                     @corresp (0 for text and body)
       to          : f:calc_content_length(.), a function defined outside the
                     part of the stylesheet visible here
       accumulator : the accumulator-derived pair for this element, joined
                     without a separator; it looks like a diagnostic aid
                     (confirm before relying on it)
       l           : f:compute_nesting(.)

     Fixes:
       * The tunnel parameter 'offsets' was never supplied by any caller, had
         a type that does not admit the empty sequence, and was shadowed by a
         local variable of the same name. That variable rebuilt the complete
         map for every single element. The map is now the default value of the
         parameter: it is built once, at the element where processing starts,
         and tunnelled to all descendants. Lookups by @xml:id give the same
         results, assuming @xml:id values are unique within the document.
       * $outside-preceding-count: the nested xsl:choose is replaced by a
         single expression. Preceding siblings have disjoint subtrees, so
         counting the union of their descendants equals the former sum, and an
         empty $preceding yields 0. -->
<xsl:template match="tei:*" mode="struct">
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>
  <xsl:param name="offsets" as="map(xs:string, xs:integer+)" tunnel="yes">
    <!-- tail() drops the empty map that is the accumulator's initial value -->
    <xsl:map>
      <xsl:for-each select="tail(fn:accumulator-after('elem-offset-seq'))">
        <xsl:map-entry key="map:get(., 'id')" select="map:get(., 'start'), map:get(., 'end')"/>
      </xsl:for-each>
    </xsl:map>
  </xsl:param>

  <!-- <xsl:message select="'size of offsets in tei:* ' || map:size($offsets)"/>-->

  <!-- It's so spread out because I want to make sure to be able to look up the individual
       constituent values, should anything go wrong; optimization will come when it's worked
       against a larger dataset -->
  <xsl:variable name="my_name" select="local-name()" as="xs:string"/>
  <xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
  <xsl:variable name="preceding-count" select="count($preceding)"/>

  <!-- For s and p: the number of elements nested in the preceding same-named
       siblings. Mind @nkjp:rejected: rejected segments are counted as well. -->
  <xsl:variable name="outside-preceding-count" as="xs:integer"
    select="
      if (self::tei:s or self::tei:p) then
        count($preceding/descendant::*)
      else
        0"/>

  <xsl:variable name="my_index" select="$index + 1 + $preceding-count + $outside-preceding-count"
    as="xs:integer"/>

  <!--<xsl:copy select="//tei:seg[count(@nkjp:rejected) ne 0 and @nkjp:rejected ne 'true']"></xsl:copy>-->

  <!-- @corresp is parsed as: the text before ')', then the text after the
       first ',', then the text before the next ','. -->
  <xsl:variable name="start" as="xs:integer">
    <xsl:choose>
      <xsl:when test="self::tei:text or self::tei:body">
        <xsl:sequence select="0"/>
      </xsl:when>
      <xsl:when test="self::tei:p">
        <xsl:variable name="first_corresp"
          select="descendant::tei:s[1]/descendant::tei:seg[1]/attribute::corresp"
          as="attribute(corresp)"/>
        <xsl:variable name="numbers" select="substring-after(substring-before($first_corresp,')'),',')"/>
        <xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
      </xsl:when>
      <xsl:when test="self::tei:s">
        <xsl:variable name="first_corresp"
          select="descendant::tei:seg[1]/attribute::corresp"
          as="attribute(corresp)"/>
        <xsl:variable name="numbers" select="substring-after(substring-before($first_corresp,')'),',')"/>
        <xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
      </xsl:when>
      <xsl:when test="self::tei:seg">
        <xsl:variable name="numbers" select="substring-after(substring-before(@corresp,')'),',')"/>
        <xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
      </xsl:when>
      <xsl:otherwise>
        <xsl:message terminate="yes" select="'Element not handled: ' || fn:local-name()"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:variable>

  <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

  <xsl:element name="span" namespace="{$KorAP_namespace}">
    <xsl:attribute name="id" select="'s' || $my_index"/>
    <xsl:attribute name="from" select="$start"/>
    <xsl:attribute name="to" select="$end"/>
    <xsl:attribute name="accumulator" select="string-join(map:get($offsets,string(@xml:id)))"/>
    <xsl:attribute name="l" select="f:compute_nesting(.)"/>
    <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
      <xsl:attribute name="type" select="'struct'"/> <!-- STRUCT vs. LEX -->
      <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
        <xsl:attribute name="name" select="'name'"/>
        <xsl:value-of select="local-name()"/>
      </xsl:element>
      <!-- The source element's attributes, one f per attribute, keyed by local name. -->
      <xsl:if test="count(@*)">
        <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
          <xsl:attribute name="name" select="'attr'"/>
          <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="type" select="'attr'"/>
            <xsl:for-each select="@*">
              <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                <xsl:attribute name="name" select="local-name(.)"/>
                <xsl:value-of select="."/>
              </xsl:element>
            </xsl:for-each>
          </xsl:element>
        </xsl:element>
      </xsl:if>
    </xsl:element>
  </xsl:element>
  <xsl:apply-templates mode="struct">
    <xsl:with-param name="ini" select="$start" as="xs:integer"/>
    <xsl:with-param name="fin" select="$end" as="xs:integer"/>
    <xsl:with-param name="index" select="$my_index"/>
    <!-- hand the map on, so that descendants do not rebuild it -->
    <xsl:with-param name="offsets" select="$offsets" tunnel="yes"/>
  </xsl:apply-templates>
</xsl:template>
544
545 <!-- ************************** morpho ******************* -->
546
<!-- Writes the morphosyntactic layer of one text to $target: a KorAP 'layer'
     element holding a 'spanList', filled by applying the 'morpho' mode
     templates to every tei:text of $ann_segmentation.xml.

     Parameters:
       text.xml             : accepted, but not read in this template
       compoundID           : value of the @docid attribute on the layer
       ann_segmentation.xml : the segmentation layer, whose tei:text is processed
       ann_morphosyntax.xml : handed on to the 'morpho' mode templates
       target               : href of the result document -->
<xsl:template name="create_morpho">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <xsl:processing-instruction name="xml-model">href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
    <xsl:element namespace="{$KorAP_namespace}" name="layer">
      <xsl:attribute name="docid">
        <xsl:value-of select="$compoundID"/>
      </xsl:attribute>
      <xsl:attribute name="version">
        <xsl:value-of select="$KorAP-XML_version"/>
      </xsl:attribute>

      <xsl:element namespace="{$KorAP_namespace}" name="spanList">
        <xsl:apply-templates mode="morpho" select="$ann_segmentation.xml//tei:text">
          <xsl:with-param name="ann_morphosyntax.xml" as="document-node()" select="$ann_morphosyntax.xml"/>
        </xsl:apply-templates>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
569
<!-- Mode "morpho", any TEI element without a more specific template:
     produces no output of its own; it computes an index and a start/end pair for the
     current element and hands them, together with the morphosyntax document, to its children.
     Parameters:
       ini, fin             : received from the parent (defaults 0 and 999999999); not read here
       index                : index of the parent (default 1)
       ann_morphosyntax.xml : passed through unchanged
     Note: the start value is only defined for text, body, p and s. -->
<xsl:template match="tei:*" mode="morpho">
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>

  <xsl:variable name="element_name" as="xs:string" select="local-name()"/>
  <!-- preceding siblings that carry the same local name as the current element -->
  <xsl:variable name="same_name_before" select="preceding-sibling::*[local-name(.) eq $element_name]"/>
  <!-- for s and p only: number of elements nested inside those preceding siblings
       (sum() over an empty sequence is 0, which covers the no-sibling case) -->
  <xsl:variable name="descendants_before" as="xs:integer"
      select="if (self::tei:s or self::tei:p)
              then sum($same_name_before ! count(descendant::*))
              else 0"/>
  <xsl:variable name="my_index" as="xs:integer"
      select="$index + 1 + count($same_name_before) + $descendants_before"/>

  <xsl:variable name="start" as="xs:integer">
    <xsl:choose>
      <xsl:when test="self::tei:text | self::tei:body">
        <xsl:sequence select="0"/>
      </xsl:when>
      <xsl:when test="self::tei:p | self::tei:s">
        <!-- anchor: @corresp of the first seg (for p: the first seg of its first s) -->
        <xsl:variable name="first_corresp" as="attribute(corresp)"
            select="(if (self::tei:p) then descendant::tei:s[1] else .)/descendant::tei:seg[1]/@corresp"/>
        <!-- first of the two comma-separated integers that follow the first comma before ')' -->
        <xsl:sequence
            select="xs:integer(substring-before(substring-after(substring-before($first_corresp, ')'), ','), ','))"/>
      </xsl:when>
    </xsl:choose>
  </xsl:variable>
  <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

  <xsl:apply-templates mode="morpho">
    <xsl:with-param name="ini" as="xs:integer" select="$start"/>
    <xsl:with-param name="fin" as="xs:integer" select="$end"/>
    <xsl:with-param name="index" select="$my_index"/>
    <xsl:with-param name="ann_morphosyntax.xml" as="document-node()" select="$ann_morphosyntax.xml"/>
  </xsl:apply-templates>
</xsl:template>
bansp5f841732022-03-16 06:27:31 +0100636
<!-- Mode "morpho", tei:seg: emits one KorAP span for the current segment.
     The span carries
       id   : 's' followed by the computed index
       from : first of the two integers found in the segment's @corresp
       to   : value of f:calc_content_length(.)
       l    : value of f:compute_nesting(.)
     and a TEI feature structure of type 'lex' holding lemma, pos, msd (only when
     non-empty) and join='left' (only when the morphosyntax entry has an 'nps' feature).
     Parameters:
       ini, fin             : received from the parent (defaults 0 and 999999999); not read here
       index                : index of the parent (default 1)
       ann_morphosyntax.xml : document in which the matching morphosyntax seg is looked up -->
<xsl:template match="tei:seg" mode="morpho">
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>

  <!-- The lookups are deliberately spread over separate variables so that each
       intermediate value can be inspected should anything go wrong. -->
  <xsl:variable name="my_name" select="local-name()" as="xs:string"/>
  <xsl:variable name="my_id" select="@xml:id" as="xs:string"/>
  <!-- the morphosyntax seg whose @corresp (after '#') names this segment -->
  <xsl:variable name="my_morph-seg" as="node()" select="$ann_morphosyntax.xml//tei:seg[substring-after(@corresp,'#') eq $my_id]"/>
  <xsl:variable name="my_disamb" select="$my_morph-seg//tei:fs/tei:f[@name eq 'disamb']" as="node()"/>
  <!-- id of the interpretation selected by the disambiguation ('choice' feature) -->
  <xsl:variable name="my_choice-id" select="substring-after($my_disamb//tei:f[@name eq 'choice']/@fVal,'#')" as="xs:string"/>
  <!-- the 'lex' feature structure that contains the selected symbol -->
  <xsl:variable name="my_choice-lex" select="$my_morph-seg//tei:f[@name eq 'interps']/tei:fs[@type eq 'lex'][descendant::tei:symbol[@xml:id eq $my_choice-id]]" as="node()"/>
  <xsl:variable name="chosen-msd" as="xs:string" select="$my_choice-lex/descendant::tei:symbol[@xml:id eq $my_choice-id]/@value"/>

  <xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
  <xsl:variable name="preceding-count" select="count($preceding)"/>
  <!-- Unlike the generic tei:* template, no descendant count of preceding siblings is added:
       that computation is tied to s and p elements and was disabled when this template
       became seg-specific. Revisit if the index scheme changes. -->
  <xsl:variable name="my_index" select="$index + 1 + $preceding-count" as="xs:integer"/>

  <xsl:variable name="start" as="xs:integer">
    <xsl:variable name="numbers" select="substring-after(substring-before(@corresp,')'),',')"/>
    <xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
  </xsl:variable>
  <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

  <xsl:element name="span" namespace="{$KorAP_namespace}">
    <xsl:attribute name="id" select="'s' || $my_index"/>
    <xsl:attribute name="from" select="$start"/>
    <xsl:attribute name="to" select="$end"/>
    <xsl:attribute name="l" select="f:compute_nesting(.)"/>
    <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
      <xsl:attribute name="type" select="'lex'"/>
      <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
        <xsl:attribute name="name" select="'lex'"/>
        <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
          <!-- the orthographic form is written as an XML comment, for human readers only -->
          <xsl:comment select="$my_morph-seg//tei:fs/tei:f[@name eq 'orth']/tei:string"/>
          <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="name" select="'lemma'"/>
            <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'base']/tei:string"/>
          </xsl:element>
          <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="name" select="'pos'"/>
            <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'ctag']/tei:symbol/@value"/>
          </xsl:element>
          <xsl:if test="string-length($chosen-msd)">
            <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
              <xsl:attribute name="name" select="'msd'"/>
              <xsl:value-of select="$chosen-msd"/>
            </xsl:element>
          </xsl:if>
          <xsl:if test="$my_morph-seg//tei:fs/tei:f[@name eq 'nps']">
            <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
              <xsl:attribute name="name" select="'join'"/>
              <xsl:value-of select="'left'"/>
            </xsl:element>
          </xsl:if>
        </xsl:element>
      </xsl:element>
    </xsl:element>
  </xsl:element>

  <!-- Children (if any) are processed after the span. Nothing but whitespace may follow the
       closing tag below: any other characters placed here are literal text of the template
       and would be copied into the result after every span. -->
  <xsl:apply-templates mode="morpho">
    <xsl:with-param name="ini" select="$start" as="xs:integer"/>
    <xsl:with-param name="fin" select="$end" as="xs:integer"/>
    <xsl:with-param name="index" select="$my_index"/>
    <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml" as="document-node()"/>
  </xsl:apply-templates>
</xsl:template>
banspe726b4a2022-03-28 05:47:45 +0200726
bansp5f841732022-03-16 06:27:31 +0100727 <!-- ************************** TEXT header ******************* -->
728
<!-- create_text_header: writes the text-level header file to $target.
     Parameters:
       text.xml   : source document; the children of TEI/teiHeader are processed in mode "text"
       compoundID : tunnelled to the "text" mode templates
       target     : href of the result document -->
<xsl:template name="create_text_header">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="target" as="xs:string"/>

  <!-- the local header.xml file -->
  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
      xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
      <xsl:apply-templates mode="text" select="$text.xml//tei:TEI/tei:teiHeader/tei:*">
        <xsl:with-param name="compoundID" tunnel="yes" as="xs:string" select="$compoundID"/>
      </xsl:apply-templates>
    </idsHeader>
  </xsl:result-document>
</xsl:template>
745
<!-- Mode "text": re-creates fileDesc as a no-namespace element and processes its children in the same mode. -->
<xsl:template mode="text" match="tei:fileDesc">
  <fileDesc>
    <xsl:apply-templates mode="#current"/>
  </fileDesc>
</xsl:template>
751
<!-- Mode "text": maps title to t.title; the children are processed in the default (unnamed) mode.
     NOTE(review): the corpus-level counterpart uses mode "header-text" and also copies
     attributes; confirm that the difference is intended. -->
<xsl:template mode="text" match="tei:title">
  <xsl:element name="t.title">
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
757
<!-- Mode "text": writes titleStmt, starting with a textSigle element that holds the
     tunnelled $compoundID, followed by the processed children. -->
<xsl:template mode="text" match="tei:titleStmt">
  <xsl:param name="compoundID" tunnel="yes" as="xs:string"/>
  <xsl:element name="titleStmt">
    <xsl:element name="textSigle">
      <xsl:value-of select="$compoundID"/>
    </xsl:element>
    <xsl:apply-templates mode="#current"/>
  </xsl:element>
</xsl:template>
767
<!-- Mode "text": re-creates publicationStmt without a namespace and processes its children in the same mode. -->
<xsl:template mode="text" match="tei:publicationStmt">
  <publicationStmt>
    <xsl:apply-templates mode="#current"/>
  </publicationStmt>
</xsl:template>
773
<!-- Mode "text": re-creates availability without a namespace; only its attributes and
     element children are processed (text nodes are not selected). -->
<xsl:template mode="text" match="tei:availability">
  <availability>
    <xsl:apply-templates select="@* | *" mode="#current"/>
  </availability>
</xsl:template>
779
<!-- Mode "text": re-creates profileDesc without a namespace and processes its children in the same mode. -->
<xsl:template mode="text" match="tei:profileDesc">
  <profileDesc>
    <xsl:apply-templates mode="#current"/>
  </profileDesc>
</xsl:template>
bansp5f841732022-03-16 06:27:31 +0100785
<!-- Mode "text": re-creates textClass without a namespace; only its attributes and
     element children are processed. -->
<xsl:template mode="text" match="tei:textClass">
  <textClass>
    <xsl:apply-templates select="@* | *" mode="#current"/>
  </textClass>
</xsl:template>
791
<!-- Modes "text" and "corpus": re-creates catRef without a namespace. Its attributes and
     element children are always processed in mode "text", whichever mode was active. -->
<xsl:template mode="text corpus" match="tei:catRef">
  <catRef>
    <xsl:apply-templates select="@* | *" mode="text"/>
  </catRef>
</xsl:template>
797
<!-- Modes "text" and "corpus": the listed attributes are copied to the output unchanged;
     xml:id is copied only when it sits somewhere below a classDecl. -->
<xsl:template mode="text corpus"
    match="@status | @scheme | @target | @type | @xml:id[ancestor::tei:classDecl] | @xml:lang">
  <xsl:copy/>
</xsl:template>
801
<!-- Modes "text" and "corpus": re-creates p without a namespace and hands its children
     to the "header-text" mode. -->
<xsl:template mode="text corpus" match="tei:p">
  <p>
    <xsl:apply-templates mode="header-text"/>
  </p>
</xsl:template>
807
808
809 <!-- OPTIMIZATION has to take modes into account -->
bansp5e2d1c02022-03-10 04:51:40 +0100810 <!-- ************************** CORPUS header ******************* -->
<!-- create_corpus_header: writes the corpus-level header file to $target.
     Parameters:
       text.xml : source document; the children of teiCorpus/teiHeader are processed in mode "corpus"
       target   : href of the result document -->
<xsl:template name="create_corpus_header">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <!-- the corpus-level header.xml file -->
  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes">
    <!-- not used, because they are, sadly, useless here:
           doctype-public="{$publicDoctypeI5}"
           doctype-system="{$systemDoctypeI5}" -->
    <idsHeader type="corpus" pattern="text" status="new" version="1.1" TEIform="teiHeader">
      <xsl:apply-templates mode="corpus" select="$text.xml/tei:teiCorpus/tei:teiHeader/tei:*"/>
    </idsHeader>
  </xsl:result-document>
</xsl:template>
828
<!-- Mode "corpus": re-creates fileDesc without a namespace and processes its children in the same mode. -->
<xsl:template mode="corpus" match="tei:fileDesc">
  <fileDesc>
    <xsl:apply-templates mode="#current"/>
  </fileDesc>
</xsl:template>
bansp9103aab2022-03-19 05:10:21 +0100834
bansp5e2d1c02022-03-10 04:51:40 +0100835
<!-- Mode "corpus": maps title to c.title; attributes are processed in mode "corpus" first,
     then the children in mode "header-text". -->
<xsl:template mode="corpus" match="tei:title">
  <xsl:element name="c.title">
    <xsl:apply-templates select="@*" mode="#current"/>
    <xsl:apply-templates select="node()" mode="header-text"/>
  </xsl:element>
</xsl:template>
842
<!-- Mode "corpus": writes titleStmt, starting with a korpusSigle element that holds the
     global $corpusID, followed by the processed children. -->
<xsl:template mode="corpus" match="tei:titleStmt">
  <xsl:element name="titleStmt">
    <xsl:element name="korpusSigle">
      <xsl:value-of select="$corpusID"/>
    </xsl:element>
    <xsl:apply-templates mode="#current"/>
  </xsl:element>
</xsl:template>
851
<!-- Mode "corpus": re-creates publicationStmt without a namespace and processes its children in the same mode. -->
<xsl:template mode="corpus" match="tei:publicationStmt">
  <publicationStmt>
    <xsl:apply-templates mode="#current"/>
  </publicationStmt>
</xsl:template>
857
<!-- Mode "corpus": re-creates availability without a namespace; only its attributes and
     element children are processed. -->
<xsl:template mode="corpus" match="tei:availability">
  <availability>
    <xsl:apply-templates select="@* | *" mode="#current"/>
  </availability>
</xsl:template>
863
<!-- Mode "corpus": re-creates encodingDesc without a namespace and processes its children in the same mode. -->
<xsl:template mode="corpus" match="tei:encodingDesc">
  <encodingDesc>
    <xsl:apply-templates mode="#current"/>
  </encodingDesc>
</xsl:template>
869
<!-- Mode "corpus": structural elements of the class declaration are re-created under their
     own local name (without a namespace); only attributes and element children are processed. -->
<xsl:template mode="corpus"
    match="tei:classDecl | tei:taxonomy | tei:category | tei:taxonomy/tei:bibl">
  <xsl:variable name="own_name" select="local-name()"/>
  <xsl:element name="{$own_name}">
    <xsl:apply-templates select="@* | *" mode="#current"/>
  </xsl:element>
</xsl:template>
875
<!-- Mode "corpus": text-bearing elements (title inside bibl, edition, desc) are re-created
     under their own local name; attributes go through mode "corpus", the content through
     mode "header-text". -->
<xsl:template mode="corpus" match="tei:bibl/tei:title | tei:edition | tei:desc">
  <xsl:variable name="own_name" select="local-name()"/>
  <xsl:element name="{$own_name}">
    <xsl:apply-templates select="@*" mode="#current"/>
    <xsl:apply-templates select="node()" mode="header-text"/>
  </xsl:element>
</xsl:template>
881 </xsl:template>
882<!--
883 <xsl:template match="tei:textClass" mode="corpus">
884 <xsl:element name="{local-name()}">
885 <xsl:apply-templates mode="corpus" select="@* | *"/>
886 </xsl:element>
887 </xsl:template>
888
889 <xsl:template match="tei:catRef" mode="corpus">
890 <xsl:element name="{local-name()}">
891 <xsl:apply-templates mode="corpus" select="@* | *"/>
892 </xsl:element>
893 </xsl:template>
894-->
bansp5e2d1c02022-03-10 04:51:40 +0100895
896
897
898 <!-- this template can be called by the XSPEC test; TODO: find a way to call the main() template directly -->
<!-- I have not fully handled the param transmission, which would have to be kludged in just for the sake of XSpec,
900 because I'm disabling this for now, due to XSpec design issues; relevant links, a.o.:
901
902 https://stackoverflow.com/questions/64933277/what-is-the-cause-of-error-cannot-execute-xslresult-document-while-evaluating
903 https://www.balisage.net/Proceedings/vol25/html/Galtman01/BalisageVol25-Galtman01.html
904
905 In short: the internal design of XSpec forces kludges when one wants to use xsl:result-document in their stylesheets. But I don't
906 want to be strangled by kludges at the beginning of work, I've already lost quite a bit of time on this investigation,
907 I will therefore "just code" and then can think of externalizing bits of templates if we want to play with tests. For now,
I don't want to have to handle context items in a special way inside variables, etc., because I'm not sure it's worth it.
909
910 -->
911 <!--<xsl:template name="test_full">
912 <xsl:param name="corpusID"/>
913 <xsl:param name="docID"/>
914 <xsl:param name="textID"/>
915 <xsl:call-template name="xsl:initial-template"/>
916 </xsl:template>-->
917
<!-- f:calc_content_length($node) as xs:integer
     Reads one @corresp "anchor" and returns the sum of the two comma-separated integers
     that follow the first comma inside its parenthesised part, e.g. '...(id,10,5)' gives 15
     (presumably a start offset plus a length; confirm against the NKJP string-range format).
     Which @corresp is read depends on $node:
       tei:text, tei:body : last accepted seg of the last s of the last p
       tei:p              : last accepted seg of the last s
       tei:s              : last accepted seg
       anything else      : the node's own @corresp
     A seg counts as accepted when it has no nkjp:rejected attribute or its value is not 'true'.
     The function fails with an error when no anchor can be found or the integers cannot be parsed. -->
<xsl:function name="f:calc_content_length" as="xs:integer">
  <xsl:param name="node" as="node()"/>

  <!-- the s element whose last accepted seg supplies the anchor (empty for non-structural nodes) -->
  <xsl:variable name="last_s" as="element()?"
      select="if ($node/self::tei:text or $node/self::tei:body)
                then $node/descendant::tei:p[last()]/descendant::tei:s[last()]
              else if ($node/self::tei:p)
                then $node/descendant::tei:s[last()]
              else $node/self::tei:s"/>

  <xsl:variable name="anchor" as="attribute(corresp)">
    <xsl:choose>
      <xsl:when test="$node/(self::tei:text | self::tei:body | self::tei:p | self::tei:s)">
        <xsl:sequence
            select="$last_s/descendant::tei:seg[not(@nkjp:rejected) or @nkjp:rejected ne 'true'][last()]/@corresp"/>
      </xsl:when>
      <xsl:otherwise>
        <xsl:sequence select="$node/@corresp"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:variable>

  <!-- 'START,LENGTH': the text between the first comma and the first ')' -->
  <xsl:variable name="numbers" as="xs:string"
      select="substring-after(substring-before($anchor, ')'), ',')"/>
  <xsl:sequence
      select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"/>
</xsl:function>
952
<!-- f:calc_offsets($node, $skip_start) as xs:integer+
     Returns the pair ($start, $end) for $node, where
       $end = $start + $length - 1 + (1 when $skip_start is true, otherwise 0).
     The two positions are addressed elsewhere through $OFFSET_START and $OFFSET_END.

     $start
       0 when $skip_start is true or $node is tei:text or tei:body;
       tei:p   : 0 for the first p of the body, otherwise the END value of the immediately
                 preceding p computed with skip_start = true();
       tei:s   : END value of the immediately preceding s in the same p (skip_start = true(),
                 0 when there is none) plus the START value of the enclosing p;
       tei:seg : first integer of its own @corresp plus the START value of the enclosing p;
       no value is defined for other elements (the typed variable then raises an error).

     $length
       tei:text, tei:body : sum, over all p, of START+LENGTH read from the last accepted seg
                            of the last s of each p;
       tei:p              : START+LENGTH of the last accepted seg of its last s;
       tei:s              : START+LENGTH of its last accepted seg;
       anything else      : START+LENGTH of the node's own @corresp.
     A seg counts as accepted when it has no nkjp:rejected attribute or its value is not 'true'.

     NOTE(review): with skip_start = true() the recursive call returns (0, length), so the
     start of a p is the length of the single preceding p only, not a running total over all
     preceding p. Confirm whether that is intended. -->
<xsl:function name="f:calc_offsets" as="xs:integer+">
  <xsl:param name="node" as="element()"/>
  <xsl:param name="skip_start" as="xs:boolean"/>

  <xsl:variable name="start" as="xs:integer">
    <xsl:choose>

      <xsl:when test="$skip_start or $node/self::tei:text or $node/self::tei:body">
        <xsl:sequence select="0"/>
      </xsl:when>

      <!-- handle p -->
      <xsl:when test="$node/self::tei:p">
        <xsl:variable name="my_pos" as="xs:integer" select="count($node/preceding-sibling::tei:p) + 1"/>
        <xsl:variable name="preceding" as="node()*"
            select="$node/ancestor::tei:body/tei:p[position() lt $my_pos]"/>
        <xsl:choose>
          <xsl:when test="count($preceding) eq 0">
            <xsl:sequence select="0"/>
          </xsl:when>
          <xsl:otherwise>
            <!-- Careful: the second argument of sum() is only the value returned for an EMPTY
                 input sequence; it does not add 1 to the result. As long as the recursive
                 call yields an END value, this expression is just that END value.
                 Open question from the original author: should a separator of 1 be counted
                 after each p? If so, it has to be added explicitly (here or in $length). -->
            <xsl:sequence select="sum(f:calc_offsets($preceding[last()],true())[$OFFSET_END],1)"/>

            <!-- Non-recursive variant, possibly much less cpu-intensive. It has not been
                 maintained since it was commented out and would have to be adjusted to the
                 current form of the recursive variant before being plugged in:

                 <xsl:variable name="last_corresps"
                   select="$preceding/descendant::tei:s[last()]/(descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'])[last()]/@corresp"
                   as="attribute(corresp)+"/>
                 <xsl:variable name="end_offsets" as="xs:integer+">
                   <xsl:for-each select="$last_corresps">
                     <xsl:variable name="numbers"
                       select="substring-after(substring-before(., ')'), ',')"/>
                     <xsl:sequence
                       select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"/>
                   </xsl:for-each>
                 </xsl:variable>
                 <xsl:sequence select="sum($end_offsets, 1)"/>
            -->
          </xsl:otherwise>
        </xsl:choose>
      </xsl:when>

      <!-- handle s: counted from the start of the current p (preceding s elements),
           plus the start of that p (preceding p elements) -->
      <xsl:when test="$node/self::tei:s">
        <xsl:variable name="internal_start" as="xs:integer">
          <xsl:variable name="my_pos" as="xs:integer" select="count($node/preceding-sibling::tei:s) + 1"/>
          <xsl:variable name="preceding" as="node()*"
              select="$node/ancestor::tei:p[1]/tei:s[position() lt $my_pos]"/>
          <xsl:choose>
            <xsl:when test="count($preceding) eq 0">
              <xsl:sequence select="0"/>
            </xsl:when>
            <xsl:otherwise>
              <!-- no separator is counted between consecutive s elements -->
              <xsl:sequence select="f:calc_offsets($preceding[last()],true())[$OFFSET_END]"/>
            </xsl:otherwise>
          </xsl:choose>
        </xsl:variable>
        <xsl:variable name="external_start" as="xs:integer"
            select="f:calc_offsets($node/ancestor::tei:p[1],false())[$OFFSET_START]"/>
        <xsl:sequence select="$internal_start + $external_start"/>
      </xsl:when>

      <!-- handle seg: the s elements are irrelevant, the local offset is immediately
           available on @corresp -->
      <xsl:when test="$node/self::tei:seg">
        <xsl:variable name="numbers"
            select="substring-after(substring-before($node/@corresp, ')'), ',')"/>
        <xsl:variable name="internal_start" as="xs:integer"
            select="xs:integer(substring-before($numbers, ','))"/>
        <xsl:variable name="external_start" as="xs:integer"
            select="f:calc_offsets($node/ancestor::tei:p[1], false())[$OFFSET_START]"/>
        <!-- trace output for segments that carry an nkjp:rejected attribute -->
        <xsl:if test="exists($node/@nkjp:rejected)">
          <xsl:message select="'numbers: ' || $numbers"/>
        </xsl:if>
        <xsl:sequence select="$internal_start + $external_start"/>
      </xsl:when>
    </xsl:choose>
  </xsl:variable>

  <!-- the @corresp attributes from which the length is read (see the header comment) -->
  <xsl:variable name="anchors" as="attribute(corresp)+">
    <xsl:choose>
      <xsl:when test="$node/self::tei:text or $node/self::tei:body or $node/self::tei:p">
        <xsl:sequence
            select="(if ($node/self::tei:p) then $node else $node/descendant::tei:p)
                    /descendant::tei:s[last()]
                    /(descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'])[last()]
                    /@corresp"/>
      </xsl:when>
      <xsl:when test="$node/self::tei:s">
        <xsl:sequence
            select="$node/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/@corresp"/>
      </xsl:when>
      <xsl:otherwise>
        <xsl:sequence select="$node/@corresp"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:variable>

  <!-- START+LENGTH of every anchor, summed (a single anchor for p, s and seg) -->
  <xsl:variable name="length" as="xs:integer"
      select="sum(for $c in $anchors
                  return
                    let $numbers := substring-after(substring-before($c, ')'), ',')
                    return xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ',')))"/>

  <xsl:message select="local-name($node) || '[' || count($node/preceding-sibling::*[local-name() eq local-name($node)])+1 || '] length: ' || $length || ' skip_start: ' || $skip_start"/>

  <xsl:sequence select="$start, $start + $length -1 + xs:integer($skip_start)"/>
</xsl:function>
1134
1135
Akron9a8ee3e2022-01-31 13:51:49 +01001136</xsl:stylesheet>
Piotr Banski6a4a2522022-05-24 01:16:47 +02001137
1138<!--<xsl:message select="('map:',serialize($map, map{'method':'adaptive'}))"/>-->