blob: 729d9162f06d644183e31534ab62ad2ddd8e1b0f [file] [log] [blame]
Akron9a8ee3e2022-01-31 13:51:49 +01001<?xml version="1.0" encoding="UTF-8"?>
2<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
bansp5e2d1c02022-03-10 04:51:40 +01003 xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"
4 xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:f="func"
Piotr Banskif8af3a92022-05-23 03:20:10 +02005 xmlns:fn="http://www.w3.org/2005/xpath-functions"
Piotr Banski6a4a2522022-05-24 01:16:47 +02006 xmlns:map="http://www.w3.org/2005/xpath-functions/map" exclude-result-prefixes="xs f fn map nkjp tei"
bansp5e2d1c02022-03-10 04:51:40 +01007 version="3.0" expand-text="yes">
Akron9a8ee3e2022-01-31 13:51:49 +01008
banspe726b4a2022-03-28 05:47:45 +02009
10<!-- PARAMETERS -->
bansp5e2d1c02022-03-10 04:51:40 +010011
<!-- sourceDir: the directory containing the NKJP files, laid out as a collection of
     text-level directories (that is how we know both the $corpusID and the $docID). -->
<xsl:param name="sourceDir" as="xs:string" select="'test/resources/nkjp2korap_sample2'"/>

<!-- targetDir: root under which the corpus/document/text/annotations hierarchy is created. -->
<xsl:param name="targetDir" as="xs:string" select="'test/output'"/>

<!-- skip_docID: comma-separated list of document IDs to be left out of processing,
       example: HellerPodgladanie,KOT
     Nothing beyond plain string identity is supported (this is just a testing aid). -->
<xsl:param name="skip_docID" as="xs:string"
  select="'HellerPodgladanie,IsakowiczZaleskiMoje,KolakowskiOco,MysliwskiKamien,WilkWilczy,ZycieWarszawy_Zycie'"/>
banspb5992532022-03-29 15:55:44 +020026
bansp8f6700b2022-03-27 05:27:09 +020027
bansp9dc10002022-05-17 22:33:34 +020028<!-- VARIABLES (= constants...) -->
banspe726b4a2022-03-28 05:47:45 +020029
<!-- Corpus-level and document-level identifiers; both are fixed to 'NKJP'. -->
<xsl:variable name="corpusID" static="yes" select="'NKJP'" as="xs:string"/>
<xsl:variable name="docID" static="yes" select="'NKJP'" as="xs:string"/>

<!-- Output directory of the corpus, with a trailing slash so that callers can append to it directly. -->
<xsl:variable name="targetCorpusDir_slashed" as="xs:string"
  select="concat($targetDir, '/', $corpusID, '/')"/>

<!-- System and public identifiers of the I5 DTD. -->
<xsl:variable name="systemDoctypeI5" static="yes" as="xs:string"
  select="'http://corpora.ids-mannheim.de/I5/DTD/i5.dtd'"/>

<xsl:variable name="publicDoctypeI5" static="yes" as="xs:string"
  select="'-//IDS//DTD I5 1.0//EN'"/>

<!-- Namespace of the generated KorAP-XML elements. -->
<xsl:variable name="KorAP_namespace" static="yes" as="xs:string"
  select="'http://ids-mannheim.de/ns/KorAP'"/>

<!-- Value written to the @version attribute of generated layers. -->
<xsl:variable name="KorAP-XML_version" static="yes" as="xs:string" select="'KorAP-0.4'"/>
<!-- this is only a bit funny -->

<!-- Query parameters appended to the collection URI, joined with ';'.
     See https://www.saxonica.com/documentation11/index.html#!sourcedocs/collections/collection-directories -->
<xsl:variable name="collection_params" static="yes" as="xs:string"
  select="string-join(('recurse=yes', 'validation=strip', 'select=text.xml', 'content-type=application/xml', 'on-error=warning', 'xinclude=yes'), ';')"/>

<!-- Documents obtained from collection() over $sourceDir with the parameters above;
     the declared type requires at least one document. -->
<xsl:variable name="collection_of_text" as="document-node()+"
  select="collection(concat($sourceDir, '?', $collection_params))"/>
banspd1bf1db2022-04-04 02:16:24 +020053
<!-- Readability aids: named positions within the (start, end) sequence
     returned by the calc_offsets() function.

     Remove together with the function!
-->
<xsl:variable name="OFFSET_START" static="yes" select="1" as="xs:integer"/>
<xsl:variable name="OFFSET_END" static="yes" select="2" as="xs:integer"/>
banspb5992532022-03-29 15:55:44 +020063
64
banspe726b4a2022-03-28 05:47:45 +020065<!-- MODES -->
bansp5e2d1c02022-03-10 04:51:40 +010066
<!-- 'corpus' and 'text': anything without an explicit template is skipped together with its subtree. -->
<xsl:mode on-no-match="deep-skip" name="corpus"/>
<xsl:mode on-no-match="deep-skip" name="text"/>
<!-- 'header-text': anything without an explicit template contributes only its text content. -->
<xsl:mode on-no-match="text-only-copy" name="header-text"/>
<!-- Unnamed mode: declares every accumulator as used. -->
<xsl:mode use-accumulators="#all"/>
71
<!-- Accumulator elem-offset-seq

     Collects, in document order, one single-entry map per accepted token:
         key   = @xml:id of the tei:seg that wraps the tei:w
         value = (start, end) character offsets of that token
     The sequence is seeded with an empty map, which consumers drop with tail().

     Offsets are computed against the text assembled by the create_data template
     (tokens joined, with one blank wherever f:is_preceded_by_ws() holds):
         start = end of the previous token + 1 if whitespace precedes this token
         end   = start + length of the token   (exclusive)
     The previous revision stored an inclusive end (start + length - 1) yet still
     used it as the next token's base, so every token after the first was placed
     one character too far to the left, and unspaced tokens overlapped.

     The per-token debugging messages have been removed.

     Segments carrying @nkjp:rejected are ignored.
     A record-style entry layout ('id' / 'start' / 'end' keys) was tried and abandoned. -->
<xsl:accumulator name="elem-offset-seq" as="map(xs:string, item()+)+" initial-value="(map{})">
  <xsl:accumulator-rule match="tei:w[parent::tei:seg[count(@nkjp:rejected) eq 0]]" phase="end">
    <!-- most recent entry; still the empty seed map when this is the first token -->
    <xsl:variable name="tip" as="map(*)" select="$value[last()]"/>

    <!-- exclusive end offset of the previous token, or 0 at the very beginning -->
    <xsl:variable name="previous_end" as="xs:integer"
      select="if (map:size($tip) eq 0) then 0 else map:get($tip, map:keys($tip)[1])[2]"/>

    <!-- a preceding blank, if any, shifts the start by one -->
    <xsl:variable name="our_base" as="xs:integer"
      select="$previous_end + xs:integer(f:is_preceded_by_ws(parent::tei:seg))"/>

    <xsl:sequence select="
        $value,
        map {
          string(parent::tei:seg/@xml:id): ($our_base, $our_base + string-length())
        }"/>
  </xsl:accumulator-rule>
</xsl:accumulator>
bansp5e2d1c02022-03-10 04:51:40 +0100105
banspe726b4a2022-03-28 05:47:45 +0200106
107 <!-- FUNCTIONS -->
108
<!-- f:compute_nesting($node)
     Returns the number of elements on the ancestor-or-self axis of $node,
     not counting elements whose local name is 'TEI' or 'teiCorpus'. -->
<xsl:function name="f:compute_nesting" as="xs:integer">
  <xsl:param name="node" as="element()"/>
  <xsl:sequence
    select="count($node/ancestor-or-self::*[not(local-name(.) = ('TEI', 'teiCorpus'))])"/>
</xsl:function>
116
<!-- f:is_preceded_by_ws($node)
     Tells whether the given tei:seg, tei:s or tei:p is preceded by whitespace.

       seg : true unless the segment carries @nkjp:nps (mere presence of the attribute
             is tested, not its value), and unless it is the first seg among its
             siblings, inside the first s among its siblings, inside the first p
             among its siblings (i.e. nothing precedes it)
       s   : true if it has a preceding sibling s, or its nearest ancestor p has a
             preceding sibling p
       p   : true if it has a preceding sibling p

     Any other element terminates the transformation with a message.

     The former debugging messages in the s and p branches concatenated whole node
     sequences with '||'; that operator accepts at most one item per operand, so the
     function raised a type error as soon as two or more preceding siblings (or
     preceding paragraphs) existed. Those messages have been removed. -->
<xsl:function name="f:is_preceded_by_ws" as="xs:boolean">
  <xsl:param name="node" as="element()"/>
  <xsl:choose>
    <xsl:when test="local-name($node) eq 'seg'">
      <xsl:variable name="no_space_marked" as="xs:boolean" select="exists($node/@nkjp:nps)"/>
      <xsl:variable name="nothing_precedes" as="xs:boolean"
        select="exists($node[count(preceding-sibling::tei:seg) eq 0]
                        /ancestor::tei:s[count(preceding-sibling::tei:s) eq 0]
                        /ancestor::tei:p[count(preceding-sibling::tei:p) eq 0])"/>
      <xsl:sequence select="not($no_space_marked) and not($nothing_precedes)"/>
    </xsl:when>
    <xsl:when test="local-name($node) eq 's'">
      <xsl:sequence
        select="exists($node/preceding-sibling::tei:s) or exists($node/ancestor::tei:p[1]/preceding-sibling::tei:p)"/>
    </xsl:when>
    <xsl:when test="local-name($node) eq 'p'">
      <xsl:sequence select="exists($node/preceding-sibling::tei:p)"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:message terminate="yes" select="'Wrong argument passed to f:is_preceded_by_ws(): ' || local-name($node) || ' Only p, s, seg are allowed.'"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>
banspd1bf1db2022-04-04 02:16:24 +0200142
banspb5992532022-03-29 15:55:44 +0200143
144<!-- UTILITY TEMPLATES -->
145
<!-- Delete an auto-inserted attribute throughout: @default yields nothing in any mode. -->
<xsl:template mode="#all" match="@default"/>

<!-- tei:w yields nothing in any mode.
     NKJP-SGJP has apparently abandoned standoff representations by adding <w> everywhere;
     for the time being, we just stick to the standoff offsets, although that may need to
     be revisited as the NKJP format has now begun to stray from its schemas and assumptions. -->
<xsl:template mode="#all" match="tei:w"/>
bansp8f6700b2022-03-27 05:27:09 +0200153
Piotr Banski6a4a2522022-05-24 01:16:47 +0200154 <!--<xsl:template match="tei:choice" mode="#all"/>-->
<!-- THIS IS ONLY TEMPORARY,
     because an interesting challenge came up where I will
     probably have to abandon straightforward mapping because of TOKENIZATION alternatives;

     but now, I just want this stylesheet to work, even if it eats some occasional token (which it now does, 'komuś' and 'czym' vanish)
     -->
Piotr Banski6a4a2522022-05-24 01:16:47 +0200161
<!-- Fall-through for tei:choice in the 'struct' mode: the choice itself produces no span,
     its segments are handed on to the generic tei:* template of the same mode.

     Previously xsl:apply-templates carried no mode attribute, so the segments were
     processed in the unnamed mode, where no template visible in this stylesheet emits
     a span; every token inside a choice was therefore dropped from the structure layer.
     mode="#current" keeps the processing in 'struct'.

     The positional parameters are declared with the same defaults as in the tei:*
     template and passed through unchanged, so that segments inside a choice are
     numbered relative to the same parent as their neighbours.

     Only segments without @nkjp:rejected are forwarded, in line with the accumulator
     and with create_data, which both ignore rejected alternatives.
     NOTE(review): confirm that rejected alternatives should not get spans of their own. -->
<xsl:template match="tei:choice" mode="struct">
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>

  <xsl:apply-templates select="descendant::tei:seg[count(@nkjp:rejected) eq 0]" mode="#current">
    <xsl:with-param name="ini" select="$ini" as="xs:integer"/>
    <xsl:with-param name="fin" select="$fin" as="xs:integer"/>
    <xsl:with-param name="index" select="$index" as="xs:integer"/>
  </xsl:apply-templates>
</xsl:template>
banspb5992532022-03-29 15:55:44 +0200166
167 <!-- MAIN PROCESSING -->
168
169
<!-- Entry point of the transformation.
     1. Writes the corpus header once.
     2. Visits every text.xml found in $collection_of_text and, unless the text's ID
        is listed in $skip_docID, converts it with process_single_sample.

     The text ID is the name of the directory that holds text.xml; the annotation
     files are expected to sit next to it in that same directory. -->
<xsl:template name="xsl:initial-template">
  <!-- Surrounding whitespace is trimmed from each entry, so that 'A, B' is
       understood in the same way as 'A,B'. -->
  <xsl:variable name="IDs_to_skip" as="xs:string*"
    select="tokenize($skip_docID, ',') ! replace(., '^\s+|\s+$', '')"/>

  <!-- We only want to call the template below once, and we hand it the first NKJP corpus
       file for that purpose, because all we need is the main corpus header, and we can
       (should) get to that from any NKJP corpus document. -->
  <xsl:call-template name="create_corpus_header">
    <xsl:with-param name="text.xml" select="$collection_of_text[1]" as="document-node()"/>
    <xsl:with-param name="target" select="$targetCorpusDir_slashed || 'header.xml'" as="xs:string"/>
  </xsl:call-template>

  <xsl:for-each select="$collection_of_text">
    <!-- The pattern is anchored at the end, so that only the file name itself is cut off,
         never a '/text.xml' fragment that happens to occur earlier in the URI. -->
    <xsl:variable name="my_dir" as="xs:string" select="replace(base-uri(), '/text\.xml$', '')"/>
    <xsl:variable name="my_textID" as="xs:string" select="tokenize($my_dir, '/')[last()]"/>
    <xsl:variable name="ann_morphosyntax.uri" as="xs:string" select="$my_dir || '/ann_morphosyntax.xml'"/>
    <xsl:variable name="ann_segmentation.uri" as="xs:string" select="$my_dir || '/ann_segmentation.xml'"/>

    <xsl:choose>
      <!-- Utility step, for when we want to ignore some texts for any reason
           (debugging, selective update). -->
      <xsl:when test="$my_textID = $IDs_to_skip"/>

      <xsl:otherwise>

        <!--<xsl:message select="f:calc_offsets(doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[2]/tei:seg[1],false())"/>-->

        <!-- <xsl:message select="doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[1]/tei:seg[1] || f:is_preceded_by_ws(doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[1]/tei:seg[1])"/>
             <xsl:message select="doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[1]/tei:seg[3] || f:is_preceded_by_ws(doc($ann_segmentation.uri)//tei:body/tei:p[1]/tei:s[1]/tei:seg[3])"/>
        -->
        <xsl:call-template name="process_single_sample">
          <xsl:with-param name="text.xml" as="document-node()" select="."/>
          <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
            select="doc($ann_morphosyntax.uri)"/>
          <xsl:with-param name="ann_segmentation.xml" as="document-node()"
            select="doc($ann_segmentation.uri)"/>
          <xsl:with-param name="my_textID" select="$my_textID" as="xs:string"/>
        </xsl:call-template>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:for-each>
</xsl:template>
209
<!-- process_single_sample
     Converts one NKJP text: derives the output directory and the compound ID, then
     calls the templates that write the individual KorAP-XML files (currently
     data.xml and struct/structure.xml; the morpho layer and the text header are
     switched off below).

     Parameters:
       text.xml              the text.xml document of the text
       ann_morphosyntax.xml  its ann_morphosyntax.xml document
       ann_segmentation.xml  its ann_segmentation.xml document
       my_textID             ID of the text; an empty textID should never happen, but if
                             the parameter is not supplied, the default '0-BAD_textID'
                             signals that at the top of the output -->
<xsl:template name="process_single_sample">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="my_textID" select="'0-BAD_textID'" as="xs:string"/>

  <!-- directory receiving all files of this text -->
  <xsl:variable name="targetBaseDir" as="xs:string"
    select="concat($targetCorpusDir_slashed, $docID, '/', $my_textID)"/>

  <!-- this is what occurs in the text and data layers as @docid -->
  <xsl:variable name="compoundID" as="xs:string"
    select="concat($corpusID, '_', $docID, '.', $my_textID)"/>

  <!-- This is space devoted to recalculating word offsets on the basis of ann_segmentation.xml
       (rather than text.xml). The results should be available to all annotation files, so we
       prepare a map here and send it off to whichever annotation layer needs it. -->

  <!-- This is done in several steps, because I wanted to be able to look stuff up. There should
       probably be some idiomatic way to reduce the memory footprint of these variables;
       I'll be happy to learn about it. -->

  <!--<xsl:variable name="map_w" as="map(xs:untypedAtomic,item()+)">
    <xsl:variable name="segs" select="$ann_segmentation.xml/tei:teiCorpus/tei:TEI/tei:text/tei:body/tei:p/tei:s//tei:seg[count(@nkjp:rejected) eq 0]" as="element()+"/>
    <xsl:map>
      <xsl:for-each-group select="$segs" group-by="ancestor::tei:p[1]/@xml:id">
        <xsl:variable name="current-p" select="current-grouping-key()"/>
        <xsl:for-each-group select="current-group()" group-by="ancestor::tei:s[1]/@xml:id">
          <xsl:variable name="current-s" select="current-grouping-key()"/>
          <xsl:for-each select="current-group()">
            <xsl:map-entry key="@xml:id" select="$current-p, $current-s, position(), f:is_preceded_by_ws(.), normalize-space(tei:w)"/>
          </xsl:for-each>
        </xsl:for-each-group>
      </xsl:for-each-group>
    </xsl:map>
  </xsl:variable>-->

  <!--<xsl:variable name="map_p-s-w" as="map(xs:untypedAtomic,item()+)">
    <xsl:map>
      <xsl:for-each-group select="map:keys($map_w)" group-by="map:get($map_w, .)[1]">
        <xsl:sort select="xs:integer(substring-before(substring-after(., 'segm_'), '.'))"
          order="ascending"/>
        <xsl:sort select="xs:integer(substring-before(substring-after(., '.'), '-'))"
          order="ascending"/>
        <xsl:variable name="current-p-pos" select="fn:position()" as="xs:integer"/>
        <!-\- the above is used in the sentence loop, when we check if it's text-initial -\->
        <xsl:variable name="current-p" select="fn:current-grouping-key()" as="xs:string"/> <!-\-xs:untypedAtomic-\->
        <xsl:variable name="p-length" select="
            sum(for $id in current-group()
            return
              string-length(map:get($map_w, $id)[5])) + count(current-group()) - xs:integer(position() ne 1) -
            count(fn:filter(current-group(), function ($w-id) {
              map:get($map_w, $w-id)[4] eq false()
            }))"/>
        <!-\- The general algorithm is:
          * count and sum the lengths of all the words
          * add 'whitespace' for all of them (= count them and add that), and then
          * subtract whitespace for those of them that are not actually preceded by it
          and if the 1st word is_preceded_by_ws then subtract 1
          because identifying that 1st word would require an extra step, we're taking a shortcut via position() -
          and that strongly depends on the presence of the xsl:sort instructions -\->

        <xsl:message select="'sum: ' || sum( for $id in current-group() return string-length(map:get($map_w, $id)[5]) )"/>
        <!-\-<xsl:message select="for $id in current-group() return (string-length(map:get($map_w, $id)[5]),map:get($map_w, $id)[4] )"/>-\->
        <xsl:message select="'cur-group count: ' || count(fn:current-group())"/>
        <!-\-<xsl:message select="fn:for-each(current-group(), function($w-id) { map:get($map_w,$w-id)[4] eq false() } )"></xsl:message>-\->
        <xsl:message select="'subtract:' || count(fn:filter(current-group(), function($w-id) { map:get($map_w,$w-id)[4] eq false() } ))"></xsl:message>
        <xsl:message select="'position: ' || position() || ', xs:integer(position() ne 1)=' || xs:integer(position() ne 1)"></xsl:message>
        <xsl:message select="'p-length: ' || $p-length"/>

        <xsl:map-entry key="current-grouping-key()" select="'p', position(), $p-length"/>

        <xsl:message select="'p: ', $current-p || ' pos:' || position(), current-group()"/>

        <xsl:for-each-group select="current-group()" group-by="map:get($map_w, .)[2]">
          <xsl:sort select="xs:integer(substring-before(substring-after(., 'segm_'), '.'))"
            order="ascending"/>
          <xsl:sort select="xs:integer(substring-before(substring-after(., '.'), '-'))"
            order="ascending"/>
          <xsl:variable name="current-s" select="fn:current-grouping-key()" as="xs:string"/> <!-\-xs:untypedAtomic-\->
          <xsl:variable name="s-length" select="
              sum(for $id in current-group()
              return
                string-length(map:get($map_w, $id)[5])) + count(current-group()) - xs:integer($current-p-pos ne 1) -
              count(fn:filter(current-group(), function ($w-id) {
                map:get($map_w, $w-id)[4] eq false()
              }))"/>


          <xsl:map-entry key="current-grouping-key()" select="'s', position(), $s-length, $current-p"/>

          <xsl:message select="'s: ', position(), current-group()"/>

          <xsl:for-each select="current-group()">
            <xsl:sort select="map:get($map_w, .)[3]" order="ascending"/>
            <xsl:map-entry key="." select="'w', position(), string-length(map:get($map_w, .)[5]), $current-s, map:get($map_w, .)[4]"/>
            <!-\- <xsl:message select="map:get($map_w, .)[5]"/>-\->
          </xsl:for-each>
        </xsl:for-each-group>
      </xsl:for-each-group>
    </xsl:map>
  </xsl:variable>-->

  <!--<xsl:variable name="offsets-p" as="map(xs:untypedAtomic,item()+)">
    <xsl:map>
      <xsl:for-each select="
          fn:filter(map:keys($map_p-s-w), function ($ele) {
            map:get($map_p-s-w, $ele)[1] eq 'p'
          })">
        <xsl:sort select="map:get($map_p-s-w, .)[2]"/>

        <xsl:map-entry key="." select="map:get($map_p-s-w, .)[2]"/>


      </xsl:for-each>
    </xsl:map>
  </xsl:variable>-->

  <!-- <xsl:message select="'map_w size: ' || map:size($map_w)"/>
       <xsl:message select="'map_s-p size: ' || map:size($map_p-s-w)"/>
       <xsl:message select="'offsets size: ' || map:size($offsets-p)"/>-->

  <!-- data.xml: the raw text, rebuilt from the segmentation layer -->
  <xsl:call-template name="create_data">
    <!--<xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>-->
    <xsl:with-param name="ann_segmentation.xml" as="document-node()" select="$ann_segmentation.xml"/>
    <xsl:with-param name="compoundID" as="xs:string" select="$compoundID"/>
    <xsl:with-param name="target" as="xs:string" select="concat($targetBaseDir, '/data.xml')"/>
  </xsl:call-template>

  <!-- struct/structure.xml: the structural spans -->
  <xsl:call-template name="create_struct">
    <xsl:with-param name="compoundID" as="xs:string" select="$compoundID"/>
    <xsl:with-param name="ann_segmentation.xml" as="document-node()" select="$ann_segmentation.xml"/>
    <xsl:with-param name="target" as="xs:string" select="concat($targetBaseDir, '/struct/structure.xml')"/>
    <!--<xsl:with-param name="offsets" select="$offsets" as="map(xs:string, xs:integer+)"/>-->
  </xsl:call-template>

  <!-- currently switched off: the morpho layer -->
  <!-- <xsl:call-template name="create_morpho">
    <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
    <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
    <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml"
      as="document-node()"/>
    <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml"
      as="document-node()"/>
    <xsl:with-param name="target" select="$targetBaseDir || '/nkjp/morpho.xml'" as="xs:string"/>
  </xsl:call-template>
  -->

  <!-- currently switched off: the text header -->
  <!--<xsl:call-template name="create_text_header">
    <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
    <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
    <xsl:with-param name="target" select="$targetBaseDir || '/header.xml'" as="xs:string"/>
  </xsl:call-template>-->

  <!--<xsl:message select="'size of offsets in process_single: ' || map:size($offsets)"/>-->

</xsl:template>
369
370 <!-- ************************** data.xml ******************* -->
371
<!-- create_data
     Writes the data.xml file of one text to $target.

     Parameters:
       ann_segmentation.xml  segmentation layer from which the raw text is rebuilt
       compoundID            value of the @docid attribute of the root element
       target                URI of the file to be written

     The text is assembled from all tei:seg elements without @nkjp:rejected that sit
     (at any depth) inside teiCorpus/TEI/text/body/p/s: for each of them, a blank is
     emitted if f:is_preceded_by_ws() says so, followed by the content of its tei:w.

     Changes against the previous revision:
     * xpath-default-namespace is a standard attribute and not an attribute value
       template, so the former "{$KorAP_namespace}" was taken as a literal string;
       the namespace is now spelled out, as in create_struct.
     * the collected pieces are typed xs:string* rather than xs:string+, so that a
       text without any accepted segment yields an empty text element instead of a
       type error. -->
<xsl:template name="create_data">
  <!--<xsl:param name="text.xml" as="document-node()"/>-->
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="target" as="xs:string"/>

  <!-- create the data.xml file -->
  <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">

    <xsl:processing-instruction name="xml-model">href=&quot;text.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
    <xsl:element name="raw_text" namespace="{$KorAP_namespace}">
      <xsl:attribute name="docid" select="$compoundID"/>
      <xsl:element name="metadata" namespace="{$KorAP_namespace}">
        <xsl:attribute name="file" select="'metadata.xml'"/>
      </xsl:element>

      <xsl:element name="text" namespace="{$KorAP_namespace}">
        <!-- alternating sequence: optional blank, token, optional blank, token, ... -->
        <xsl:variable name="content" as="xs:string*">
          <xsl:for-each select="$ann_segmentation.xml/tei:teiCorpus/tei:TEI/tei:text/tei:body/tei:p/tei:s//tei:seg[count(@nkjp:rejected) eq 0]">
            <xsl:sequence select="
                if (f:is_preceded_by_ws(.)) then
                  ' '
                else
                  '', ./tei:w"/>
          </xsl:for-each>
        </xsl:variable>
        <xsl:value-of select="string-join($content)"/>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
403
bansp5f841732022-03-16 06:27:31 +0100404 <!-- ************************** struct ******************* -->
405
<!-- create_struct
     Writes the structure layer of one text to $target: a KorAP 'layer' element
     holding a 'spanList', whose content is produced by applying the 'struct' mode
     to every tei:text element of the segmentation document.

     Parameters:
       compoundID            value of the @docid attribute of the layer
       ann_segmentation.xml  segmentation layer to be traversed
       target                URI of the file to be written -->
<xsl:template name="create_struct">
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>
  <!--<xsl:param name="offsets" as="map(xs:string, xs:integer+)"/>-->

  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <xsl:processing-instruction name="xml-model">href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
    <xsl:element namespace="{$KorAP_namespace}" name="layer">
      <xsl:attribute name="docid" select="$compoundID"/>
      <xsl:attribute name="version" select="$KorAP-XML_version"/>

      <!--<xsl:message select="'size of offsets in create_struct: ' || map:size($offsets)"/>-->

      <xsl:element namespace="{$KorAP_namespace}" name="spanList">
        <xsl:apply-templates mode="struct" select="$ann_segmentation.xml//tei:text">
          <!-- disabled: handing the offsets down as a tunnel parameter -->
          <!--<xsl:with-param name="offsets" as="map(xs:string, xs:integer+)" tunnel="yes">
            <xsl:map>
              <xsl:for-each select="tail(fn:accumulator-after('elem-offset-seq'))">
                <xsl:map-entry key="map:get(., 'id')" select="map:get(., 'start'), map:get(., 'end')"/>
              </xsl:for-each>
            </xsl:map>
          </xsl:with-param>-->
        </xsl:apply-templates>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
435
<!-- Generic template of the 'struct' mode: emits one KorAP span for the current TEI
     element and then descends into its children in the same mode.

     Parameters (all optional):
       ini    start offset handed down by the parent (default 0); not read here
       fin    end offset handed down by the parent (default 999999999); not read here
       index  running index of the parent span (default 1)

     The span receives
       @id           's' followed by the computed index
       @from         0 for text and body; for p, s and seg the second of the
                     comma-separated values found before ')' in the relevant @corresp
       @to           the value returned by f:calc_content_length(.)
       @accumulator  (test output) the offsets recorded for this element's @xml:id by
                     the elem-offset-seq accumulator, joined with ','
       @l            nesting depth according to f:compute_nesting(.)
     and a TEI feature structure with the element name, for seg its normalized text,
     and all attributes of the element.

     Any TEI element other than text, body, p, s, seg terminates the transformation.

     It's so spread out because the individual constituent values should stay easy to
     look up, should anything go wrong; optimization will come when it has been worked
     against a larger dataset. -->
<xsl:template match="tei:*" mode="struct">
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>
  <!--<xsl:param name="offsets" as="map(xs:string, xs:integer+)" tunnel="yes"/>-->

  <!-- all accumulator entries available after this element, merged into one lookup map;
       NOTE(review): this merge is repeated for every element that gets a span -->
  <xsl:variable name="offsets" as="map(xs:string, xs:integer+)"
    select="map:merge(tail(fn:accumulator-after('elem-offset-seq')))"/>

  <!-- <xsl:message select="'size of offsets in tei:* ' || map:size($offsets)"/>-->

  <xsl:variable name="my_name" as="xs:string" select="local-name()"/>
  <!-- preceding siblings bearing the same local name as the current element -->
  <xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
  <xsl:variable name="preceding-count" select="count($preceding)"/>

  <!-- for s and p: total number of descendant elements of those preceding siblings
       (mind @nkjp:rejected); 0 for everything else -->
  <xsl:variable name="outside-preceding-count" as="xs:integer"
    select="
      if (self::tei:s or self::tei:p) then
        sum($preceding ! count(descendant::*))
      else
        0"/>

  <xsl:variable name="my_index" as="xs:integer"
    select="$index + 1 + $preceding-count + $outside-preceding-count"/>

  <!--<xsl:copy select="//tei:seg[count(@nkjp:rejected) ne 0 and @nkjp:rejected ne 'true']"></xsl:copy>-->

  <xsl:variable name="start" as="xs:integer">
    <xsl:choose>
      <xsl:when test="self::tei:text or self::tei:body">
        <xsl:sequence select="0"/>
      </xsl:when>
      <xsl:when test="self::tei:p or self::tei:s">
        <!-- p: first seg of the first s below it; s: first seg below it -->
        <xsl:variable name="anchor" as="attribute(corresp)"
          select="
            if (self::tei:p) then
              descendant::tei:s[1]/descendant::tei:seg[1]/attribute::corresp
            else
              descendant::tei:seg[1]/attribute::corresp"/>
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before($anchor, ')'), ','), ','))"/>
      </xsl:when>
      <xsl:when test="self::tei:seg">
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before(@corresp, ')'), ','), ','))"/>
      </xsl:when>
      <xsl:otherwise>
        <xsl:message terminate="yes" select="'Element not handled: ' || fn:local-name()"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:variable>

  <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

  <xsl:element namespace="{$KorAP_namespace}" name="span">
    <xsl:attribute name="id" select="'s' || $my_index"/>
    <xsl:attribute name="from" select="$start"/>
    <xsl:attribute name="to" select="$end"/>
    <!-- test output -->
    <xsl:attribute name="accumulator" select="string-join(map:get($offsets, string(@xml:id)), ',')"/>
    <xsl:attribute name="l" select="f:compute_nesting(.)"/>
    <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="fs">
      <!-- STRUCT vs. LEX -->
      <xsl:attribute name="type" select="'struct'"/>
      <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="f">
        <xsl:attribute name="name" select="'name'"/>
        <xsl:value-of select="local-name()"/>
      </xsl:element>
      <!-- test output: the token itself -->
      <xsl:if test="self::tei:seg">
        <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="f">
          <xsl:attribute name="name" select="'orth'"/>
          <xsl:value-of select="fn:normalize-space(.)"/>
        </xsl:element>
      </xsl:if>
      <xsl:if test="exists(@*)">
        <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="f">
          <xsl:attribute name="name" select="'attr'"/>
          <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="fs">
            <xsl:attribute name="type" select="'attr'"/>
            <xsl:for-each select="@*">
              <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="f">
                <xsl:attribute name="name" select="local-name(.)"/>
                <xsl:value-of select="."/>
              </xsl:element>
            </xsl:for-each>
          </xsl:element>
        </xsl:element>
      </xsl:if>
    </xsl:element>
  </xsl:element>

  <!-- children continue in the same mode, numbered relative to this span -->
  <xsl:apply-templates mode="struct">
    <xsl:with-param name="ini" select="$start" as="xs:integer"/>
    <xsl:with-param name="fin" select="$end" as="xs:integer"/>
    <xsl:with-param name="index" select="$my_index"/>
  </xsl:apply-templates>
</xsl:template>
554
555 <!-- ************************** morpho ******************* -->
556
<xsl:template name="create_morpho">
  <!-- Writes the morphosyntactic annotation layer for one text to $target.
       The span list is produced by walking every tei:text of the segmentation
       document in mode "morpho"; the morphosyntax document is handed down so
       that each segment can look up its own interpretation.
       $text.xml is accepted but not referenced inside this template. -->
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <!-- associate the output with its RELAX NG schema -->
    <xsl:processing-instruction name="xml-model"
      select="'href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;'"/>
    <xsl:element name="layer" namespace="{$KorAP_namespace}">
      <xsl:attribute name="docid" select="$compoundID"/>
      <xsl:attribute name="version" select="$KorAP-XML_version"/>
      <xsl:element name="spanList" namespace="{$KorAP_namespace}">
        <xsl:apply-templates mode="morpho" select="$ann_segmentation.xml//tei:text">
          <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
            select="$ann_morphosyntax.xml"/>
        </xsl:apply-templates>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
579
<xsl:template match="tei:*" mode="morpho">
  <!-- Structural pass-through for the morpho layer: emits nothing itself, it
       only computes the running index and the offsets of the current container
       and hands them to its children. Segments are handled by the more
       specific tei:seg template in the same mode.

       Parameters:
         ini, fin             offsets inherited from the parent (not read here)
         index                running span index of the parent
         ann_morphosyntax.xml morphosyntax document, forwarded unchanged -->
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>

  <xsl:variable name="my_name" select="local-name()" as="xs:string"/>
  <xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
  <xsl:variable name="preceding-count" select="count($preceding)"/>

  <!-- For s and p, every descendant of the same-named preceding siblings has
       already consumed an index. sum() over an empty sequence is 0, so no
       separate test for "no preceding siblings" is needed. -->
  <xsl:variable name="outside-preceding-count" as="xs:integer"
    select="if (self::tei:s or self::tei:p)
            then sum(for $p in $preceding return count($p/descendant::*))
            else 0"/>

  <xsl:variable name="my_index" as="xs:integer"
    select="$index + 1 + $preceding-count + $outside-preceding-count"/>

  <!-- Start offset: the second field of the string-range found in @corresp of
       the first segment inside the container. -->
  <xsl:variable name="start" as="xs:integer">
    <xsl:choose>
      <xsl:when test="self::tei:text or self::tei:body">
        <xsl:sequence select="0"/>
      </xsl:when>
      <xsl:when test="self::tei:p">
        <xsl:variable name="first_corresp" as="attribute(corresp)"
          select="descendant::tei:s[1]/descendant::tei:seg[1]/attribute::corresp"/>
        <xsl:variable name="numbers"
          select="substring-after(substring-before($first_corresp, ')'), ',')"/>
        <xsl:sequence select="xs:integer(substring-before($numbers, ','))"/>
      </xsl:when>
      <xsl:when test="self::tei:s">
        <xsl:variable name="first_corresp" as="attribute(corresp)"
          select="descendant::tei:seg[1]/attribute::corresp"/>
        <xsl:variable name="numbers"
          select="substring-after(substring-before($first_corresp, ')'), ',')"/>
        <xsl:sequence select="xs:integer(substring-before($numbers, ','))"/>
      </xsl:when>
      <xsl:otherwise>
        <!-- Without this branch the variable would be an empty sequence and
             fail its xs:integer type check with an unhelpful error; report
             the offending element instead, as the struct-mode template does. -->
        <xsl:message terminate="yes"
          select="'Element not handled in morpho mode: ' || local-name()"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:variable>

  <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

  <xsl:apply-templates mode="morpho">
    <xsl:with-param name="ini" select="$start" as="xs:integer"/>
    <xsl:with-param name="fin" select="$end" as="xs:integer"/>
    <xsl:with-param name="index" select="$my_index"/>
    <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml"
      as="document-node()"/>
  </xsl:apply-templates>
</xsl:template>
bansp5f841732022-03-16 06:27:31 +0100646
<xsl:template match="tei:seg" mode="morpho">
  <!-- Emits one KorAP span per segment of the segmentation layer, carrying the
       disambiguated lemma, part of speech and (when present) the
       morphosyntactic description taken from the morphosyntax layer.

       Parameters:
         ini, fin             offsets inherited from the parent (not read here)
         index                running span index of the parent
         ann_morphosyntax.xml document in which the matching seg is looked up

       The lookups below are deliberately kept as separate, strictly typed
       variables so that each intermediate value can be inspected; a missing or
       ambiguous match fails the type check of the corresponding variable. -->
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>

  <xsl:variable name="my_name" select="local-name()" as="xs:string"/>
  <xsl:variable name="my_id" select="@xml:id" as="xs:string"/>
  <!-- the morphosyntax seg whose @corresp points at this segment -->
  <xsl:variable name="my_morph-seg" as="node()"
    select="$ann_morphosyntax.xml//tei:seg[substring-after(@corresp,'#') eq $my_id]"/>
  <xsl:variable name="my_disamb" as="node()"
    select="$my_morph-seg//tei:fs/tei:f[@name eq 'disamb']"/>
  <!-- id of the interpretation selected by the disambiguation -->
  <xsl:variable name="my_choice-id" as="xs:string"
    select="substring-after($my_disamb//tei:f[@name eq 'choice']/@fVal,'#')"/>
  <!-- the lex feature structure that contains the selected interpretation -->
  <xsl:variable name="my_choice-lex" as="node()"
    select="$my_morph-seg//tei:f[@name eq 'interps']/tei:fs[@type eq 'lex'][descendant::tei:symbol[@xml:id eq $my_choice-id]]"/>
  <xsl:variable name="chosen-msd" as="xs:string"
    select="$my_choice-lex/descendant::tei:symbol[@xml:id eq $my_choice-id]/@value"/>

  <xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
  <xsl:variable name="preceding-count" select="count($preceding)"/>
  <!-- No "outside preceding" correction is applied here: the generic tei:*
       template only adds one for s and p, which can never be the case for a
       seg. Revisit if segment indices must continue across sentences. -->
  <xsl:variable name="my_index" select="$index + 1 + $preceding-count" as="xs:integer"/>

  <!-- start is the second field of the string-range in @corresp -->
  <xsl:variable name="start" as="xs:integer">
    <xsl:variable name="numbers" select="substring-after(substring-before(@corresp,')'),',')"/>
    <xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
  </xsl:variable>
  <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

  <xsl:element name="span" namespace="{$KorAP_namespace}">
    <xsl:attribute name="id" select="'s' || $my_index"/>
    <xsl:attribute name="from" select="$start"/>
    <xsl:attribute name="to" select="$end"/>
    <xsl:attribute name="l" select="f:compute_nesting(.)"/>
    <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
      <xsl:attribute name="type" select="'lex'"/>
      <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
        <xsl:attribute name="name" select="'lex'"/>
        <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
          <!-- the orthographic form is recorded as a comment only -->
          <xsl:comment select="$my_morph-seg//tei:fs/tei:f[@name eq 'orth']/tei:string"/>

          <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="name" select="'lemma'"/>
            <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'base']/tei:string"/>
          </xsl:element>
          <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="name" select="'pos'"/>
            <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'ctag']/tei:symbol/@value"/>
          </xsl:element>
          <xsl:if test="string-length($chosen-msd)">
            <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
              <xsl:attribute name="name" select="'msd'"/>
              <xsl:value-of select="$chosen-msd"/>
            </xsl:element>
          </xsl:if>
          <!-- an "nps" feature on the morphosyntax seg is rendered as join=left -->
          <xsl:if test="$my_morph-seg//tei:fs/tei:f[@name eq 'nps']">
            <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
              <xsl:attribute name="name" select="'join'"/>
              <xsl:value-of select="'left'"/>
            </xsl:element>
          </xsl:if>
        </xsl:element>
      </xsl:element>
    </xsl:element>
  </xsl:element>

  <!-- The closing tag below used to be followed by a stray comment terminator.
       No comment was open at that point, so it was literal text and was
       written into the result after every span. It has been removed; the
       apply-templates itself is kept as it was. -->
  <xsl:apply-templates mode="morpho">
    <xsl:with-param name="ini" select="$start" as="xs:integer"/>
    <xsl:with-param name="fin" select="$end" as="xs:integer"/>
    <xsl:with-param name="index" select="$my_index"/>
    <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml"
      as="document-node()"/>
  </xsl:apply-templates>
</xsl:template>
banspe726b4a2022-03-28 05:47:45 +0200736
bansp5f841732022-03-16 06:27:31 +0100737 <!-- ************************** TEXT header ******************* -->
738
<xsl:template name="create_text_header">
  <!-- Writes the text-level header.xml to $target. The children of the
       teiHeader of $text.xml are transformed in mode "text"; $compoundID is
       sent down as a tunnel parameter so that any template on the way can
       pick it up. -->
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="target" as="xs:string"/>

  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
      <xsl:apply-templates mode="text"
        select="$text.xml/descendant::tei:TEI/tei:teiHeader/tei:*">
        <xsl:with-param name="compoundID" select="$compoundID" as="xs:string" tunnel="yes"/>
      </xsl:apply-templates>
    </idsHeader>
  </xsl:result-document>
</xsl:template>
755
<xsl:template match="tei:fileDesc" mode="text">
  <!-- re-create the element without a namespace and descend in the same mode -->
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="node()" mode="text"/>
  </xsl:element>
</xsl:template>
761
<xsl:template match="tei:title" mode="text">
  <!-- Text-level title. The content is processed in mode "header-text", the
       mode used for the content of the other header text containers
       (corpus-level title, p, bibl/title, edition, desc), rather than in the
       unnamed default mode.
       NOTE(review): mode "header-text" is defined outside this section;
       confirm it handles title content as intended. -->
  <t.title>
    <xsl:apply-templates mode="header-text"/>
  </t.title>
</xsl:template>
767
<xsl:template match="tei:titleStmt" mode="text">
  <!-- The title statement is prefixed with the text sigle, which arrives as
       the tunnel parameter $compoundID. -->
  <xsl:param name="compoundID" as="xs:string" tunnel="yes"/>
  <titleStmt>
    <textSigle>{$compoundID}</textSigle>
    <xsl:apply-templates select="node()" mode="text"/>
  </titleStmt>
</xsl:template>
777
<xsl:template match="tei:publicationStmt" mode="text">
  <!-- re-create the element without a namespace and descend in the same mode -->
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="node()" mode="text"/>
  </xsl:element>
</xsl:template>
783
<xsl:template match="tei:availability" mode="text">
  <!-- attributes first, then child elements; text node children are not selected -->
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="@*, *" mode="text"/>
  </xsl:element>
</xsl:template>
789
<xsl:template match="tei:profileDesc" mode="text">
  <!-- re-create the element without a namespace and descend in the same mode -->
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="node()" mode="text"/>
  </xsl:element>
</xsl:template>
bansp5f841732022-03-16 06:27:31 +0100795
<xsl:template match="tei:textClass" mode="text">
  <!-- attributes first, then child elements; text node children are not selected -->
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="@*, *" mode="text"/>
  </xsl:element>
</xsl:template>
801
<xsl:template match="tei:catRef" mode="text corpus">
  <!-- Shared by both header modes. Attributes and child elements are always
       processed in mode "text", whichever of the two modes is current. -->
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="@*, *" mode="text"/>
  </xsl:element>
</xsl:template>
807
<xsl:template mode="text corpus"
  match="@status | @scheme | @target | @type | @xml:id[ancestor::tei:classDecl] | @xml:lang">
  <!-- the listed attributes are carried over unchanged; xml:id only inside classDecl -->
  <xsl:copy/>
</xsl:template>
811
<xsl:template match="tei:p" mode="text corpus">
  <!-- header paragraph: content is handed to mode "header-text" -->
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="node()" mode="header-text"/>
  </xsl:element>
</xsl:template>
817
818
819 <!-- OPTIMIZATION has to take modes into account -->
bansp5e2d1c02022-03-10 04:51:40 +0100820 <!-- ************************** CORPUS header ******************* -->
<xsl:template name="create_corpus_header">
  <!-- Writes the corpus-level header.xml to $target from the teiHeader that is
       a child of the teiCorpus document element of $text.xml. -->
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <!-- doctype-public / doctype-system ($publicDoctypeI5, $systemDoctypeI5)
       are intentionally not set on the result document: they turned out to
       be of no use here. -->
  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes">
    <idsHeader type="corpus" pattern="text" status="new" version="1.1" TEIform="teiHeader">
      <xsl:apply-templates mode="corpus"
        select="$text.xml/tei:teiCorpus/tei:teiHeader/tei:*"/>
    </idsHeader>
  </xsl:result-document>
</xsl:template>
838
<xsl:template match="tei:fileDesc" mode="corpus">
  <!-- re-create the element without a namespace and descend in the same mode -->
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="node()" mode="corpus"/>
  </xsl:element>
</xsl:template>
bansp9103aab2022-03-19 05:10:21 +0100844
bansp5e2d1c02022-03-10 04:51:40 +0100845
<xsl:template match="tei:title" mode="corpus">
  <!-- corpus-level title: attributes go through mode "corpus", the content
       through mode "header-text" -->
  <c.title>
    <xsl:apply-templates select="@*" mode="corpus"/>
    <xsl:apply-templates select="node()" mode="header-text"/>
  </c.title>
</xsl:template>
852
<xsl:template match="tei:titleStmt" mode="corpus">
  <!-- the title statement is prefixed with the corpus sigle taken from the
       global $corpusID -->
  <titleStmt>
    <korpusSigle>{$corpusID}</korpusSigle>
    <xsl:apply-templates select="node()" mode="corpus"/>
  </titleStmt>
</xsl:template>
861
<xsl:template match="tei:publicationStmt" mode="corpus">
  <!-- re-create the element without a namespace and descend in the same mode -->
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="node()" mode="corpus"/>
  </xsl:element>
</xsl:template>
867
<xsl:template match="tei:availability" mode="corpus">
  <!-- attributes first, then child elements; text node children are not selected -->
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="@*, *" mode="corpus"/>
  </xsl:element>
</xsl:template>
873
<xsl:template match="tei:encodingDesc" mode="corpus">
  <!-- re-create the element without a namespace and descend in the same mode -->
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="node()" mode="corpus"/>
  </xsl:element>
</xsl:template>
879
<xsl:template mode="corpus"
  match="tei:classDecl | tei:taxonomy | tei:category | tei:taxonomy/tei:bibl">
  <!-- structural parts of the taxonomy declaration: keep the local name,
       pass on attributes and child elements; text node children are not
       selected -->
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="@*, *" mode="corpus"/>
  </xsl:element>
</xsl:template>
885
<xsl:template match="tei:bibl/tei:title | tei:edition | tei:desc" mode="corpus">
  <!-- text-bearing leaves: attributes go through mode "corpus", the content
       through mode "header-text" -->
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="@*" mode="corpus"/>
    <xsl:apply-templates select="node()" mode="header-text"/>
  </xsl:element>
</xsl:template>
892<!--
893 <xsl:template match="tei:textClass" mode="corpus">
894 <xsl:element name="{local-name()}">
895 <xsl:apply-templates mode="corpus" select="@* | *"/>
896 </xsl:element>
897 </xsl:template>
898
899 <xsl:template match="tei:catRef" mode="corpus">
900 <xsl:element name="{local-name()}">
901 <xsl:apply-templates mode="corpus" select="@* | *"/>
902 </xsl:element>
903 </xsl:template>
904-->
bansp5e2d1c02022-03-10 04:51:40 +0100905
906
907
908 <!-- this template can be called by the XSPEC test; TODO: find a way to call the main() template directly -->
909 <!-- I have not fully handled the param transmission, which would have to be kludged in just for the sake of XSPec,
910 because I'm disabling this for now, due to XSpec design issues; relevant links, a.o.:
911
912 https://stackoverflow.com/questions/64933277/what-is-the-cause-of-error-cannot-execute-xslresult-document-while-evaluating
913 https://www.balisage.net/Proceedings/vol25/html/Galtman01/BalisageVol25-Galtman01.html
914
915 In short: the internal design of XSpec forces kludges when one wants to use xsl:result-document in their stylesheets. But I don't
916 want to be strangled by kludges at the beginning of work, I've already lost quite a bit of time on this investigation,
917 I will therefore "just code" and then can think of externalizing bits of templates if we want to play with tests. For now,
918 I don't want to have to handle context items in a special way inside variables, etc., because I'm not sure it's worth it.
919
920 -->
921 <!--<xsl:template name="test_full">
922 <xsl:param name="corpusID"/>
923 <xsl:param name="docID"/>
924 <xsl:param name="textID"/>
925 <xsl:call-template name="xsl:initial-template"/>
926 </xsl:template>-->
927
<xsl:function name="f:calc_content_length" as="xs:integer">
  <!-- Returns the sum of the last two comma-separated fields of a string-range
       pointer of the form "...(id,A,B)" found in @corresp, that is A + B.

       Which @corresp is used depends on $node:
         text, body  last non-rejected seg of the last s of the last p
         p           last non-rejected seg of the last s
         s           last non-rejected seg
         otherwise   the @corresp of $node itself
       A seg counts as rejected when it carries nkjp:rejected="true".
       For the three container cases exactly one attribute must be found,
       otherwise the typed variable raises an error. -->
  <xsl:param name="node" as="node()"/>
  <xsl:choose>
    <xsl:when test="$node/self::tei:text or $node/self::tei:body">
      <xsl:variable name="anchor" as="attribute(corresp)"
        select="$node/descendant::tei:p[last()]/descendant::tei:s[last()]/descendant::tei:seg[not(@nkjp:rejected eq 'true')][last()]/@corresp"/>
      <xsl:sequence select="
          let $pair := substring-after(substring-before($anchor, ')'), ',')
          return xs:integer(substring-before($pair, ',')) + xs:integer(substring-after($pair, ','))"/>
    </xsl:when>
    <xsl:when test="$node/self::tei:p">
      <xsl:variable name="anchor" as="attribute(corresp)"
        select="$node/descendant::tei:s[last()]/descendant::tei:seg[not(@nkjp:rejected eq 'true')][last()]/@corresp"/>
      <xsl:sequence select="
          let $pair := substring-after(substring-before($anchor, ')'), ',')
          return xs:integer(substring-before($pair, ',')) + xs:integer(substring-after($pair, ','))"/>
    </xsl:when>
    <xsl:when test="$node/self::tei:s">
      <xsl:variable name="anchor" as="attribute(corresp)"
        select="$node/descendant::tei:seg[not(@nkjp:rejected eq 'true')][last()]/@corresp"/>
      <xsl:sequence select="
          let $pair := substring-after(substring-before($anchor, ')'), ',')
          return xs:integer(substring-before($pair, ',')) + xs:integer(substring-after($pair, ','))"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:variable name="pair"
        select="substring-after(substring-before($node/@corresp, ')'), ',')"/>
      <!-- temporary diagnostic for segs that carry nkjp:rejected; to be removed -->
      <xsl:if test="exists($node/self::tei:seg[@nkjp:rejected])">
        <xsl:message select="$pair"/>
      </xsl:if>
      <xsl:sequence
        select="xs:integer(substring-before($pair, ',')) + xs:integer(substring-after($pair, ','))"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>
962
<xsl:function name="f:calc_offsets" as="xs:integer+">
  <!-- Returns a pair of integers for $node: ($start, $start + $length - 1),
       with 1 added to the second member when $skip_start is true. Callers
       pick the members with the global positions $OFFSET_START and
       $OFFSET_END.

       $node        a text, body, p, s or seg element; for any other element
                    with $skip_start false, no branch supplies a start and the
                    xs:integer type check of $start fails
       $skip_start  when true the start is forced to 0, so the second member
                    becomes the bare length

       Every call emits a trace message; segs carrying nkjp:rejected emit
       additional diagnostics. -->
  <xsl:param name="node" as="element()"/>
  <xsl:param name="skip_start" as="xs:boolean"/>

  <xsl:variable name="start" as="xs:integer">
    <xsl:choose>

      <xsl:when test="$skip_start or $node/self::tei:text or $node/self::tei:body">
        <xsl:sequence select="0"/>
      </xsl:when>

      <!-- p: derived from the nearest preceding p among the children of body -->
      <xsl:when test="$node/self::tei:p">
        <xsl:variable name="earlier" as="element()*"
          select="$node/ancestor::tei:body/tei:p[position() le count($node/preceding-sibling::tei:p)]"/>
        <!-- Note: the second argument of sum() is only the value returned for
             an empty input; it is NOT added to a non-empty one. With a
             non-empty input the result is therefore the end member as is.
             TODO confirm whether a separator of 1 between paragraphs is
             wanted; if so it has to be added explicitly.
             Only the immediately preceding p is consulted, and with
             skip_start = true, so this is that paragraph's own length. -->
        <xsl:sequence select="
            if (empty($earlier)) then 0
            else sum(f:calc_offsets($earlier[last()], true())[$OFFSET_END], 1)"/>
      </xsl:when>

      <!-- s: offset inside the enclosing p (taken from the nearest preceding s)
           plus the start of that p -->
      <xsl:when test="$node/self::tei:s">
        <xsl:variable name="earlier" as="element()*"
          select="$node/ancestor::tei:p[1]/tei:s[position() le count($node/preceding-sibling::tei:s)]"/>
        <xsl:variable name="inside_p" as="xs:integer" select="
            if (empty($earlier)) then 0
            else f:calc_offsets($earlier[last()], true())[$OFFSET_END]"/>
        <xsl:variable name="p_start" as="xs:integer"
          select="f:calc_offsets($node/ancestor::tei:p[1], false())[$OFFSET_START]"/>
        <xsl:sequence select="$inside_p + $p_start"/>
      </xsl:when>

      <!-- seg: the local offset is read directly from @corresp; sentences play
           no role, only the start of the enclosing p is added -->
      <xsl:when test="$node/self::tei:seg">
        <xsl:variable name="pair"
          select="substring-after(substring-before($node/@corresp, ')'), ',')"/>
        <xsl:variable name="inside_p" as="xs:integer"
          select="xs:integer(substring-before($pair, ','))"/>
        <xsl:variable name="p_start" as="xs:integer"
          select="f:calc_offsets($node/ancestor::tei:p[1], false())[$OFFSET_START]"/>
        <xsl:if test="exists($node/@nkjp:rejected)">
          <xsl:message select="'numbers: ' || $pair"/>
        </xsl:if>
        <xsl:sequence select="$inside_p + $p_start"/>
      </xsl:when>
    </xsl:choose>
  </xsl:variable>

  <xsl:variable name="length" as="xs:integer">
    <xsl:choose>

      <!-- text, body and p: add up A + B of the last non-rejected seg of the
           last s of every paragraph in scope (for p, the paragraph itself) -->
      <xsl:when test="$node/self::tei:text or $node/self::tei:body or $node/self::tei:p">
        <xsl:variable name="scope" as="element()*"
          select="if ($node/self::tei:p) then $node else $node/descendant::tei:p"/>
        <xsl:variable name="anchors" as="attribute(corresp)+"
          select="$scope/descendant::tei:s[last()]/(descendant::tei:seg[not(@nkjp:rejected eq 'true')])[last()]/@corresp"/>
        <xsl:variable name="ends" as="xs:integer+" select="
            for $a in $anchors
            return
              let $pair := substring-after(substring-before($a, ')'), ',')
              return xs:integer(substring-before($pair, ',')) + xs:integer(substring-after($pair, ','))"/>
        <xsl:sequence select="sum($ends)"/>
      </xsl:when>

      <!-- s: A + B of its last non-rejected seg -->
      <xsl:when test="$node/self::tei:s">
        <xsl:variable name="anchor" as="attribute(corresp)"
          select="$node/descendant::tei:seg[not(@nkjp:rejected eq 'true')][last()]/@corresp"/>
        <xsl:sequence select="
            let $pair := substring-after(substring-before($anchor, ')'), ',')
            return xs:integer(substring-before($pair, ',')) + xs:integer(substring-after($pair, ','))"/>
      </xsl:when>

      <!-- anything else: A + B of the node's own @corresp -->
      <xsl:otherwise>
        <xsl:variable name="pair"
          select="substring-after(substring-before($node/@corresp, ')'), ',')"/>
        <!-- temporary diagnostic for segs that carry nkjp:rejected; to be removed -->
        <xsl:if test="exists($node/self::tei:seg[@nkjp:rejected])">
          <xsl:message select="'rejected: ' || $pair"/>
        </xsl:if>
        <xsl:sequence
          select="xs:integer(substring-before($pair, ',')) + xs:integer(substring-after($pair, ','))"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:variable>

  <!-- trace: element name, 1-based position among same-named siblings, length -->
  <xsl:message select="
      local-name($node) || '['
      || (count($node/preceding-sibling::*[local-name() eq local-name($node)]) + 1)
      || '] length: ' || $length || ' skip_start: ' || $skip_start"/>

  <xsl:sequence select="$start, $start + $length - 1 + xs:integer($skip_start)"/>
</xsl:function>
1144
1145
Akron9a8ee3e2022-01-31 13:51:49 +01001146</xsl:stylesheet>
Piotr Banski6a4a2522022-05-24 01:16:47 +02001147
1148<!--<xsl:message select="('map:',serialize($map, map{'method':'adaptive'}))"/>-->