blob: e220a051db59b7099acf177159f8ed903a23504a [file] [log] [blame]
Akron9a8ee3e2022-01-31 13:51:49 +01001<?xml version="1.0" encoding="UTF-8"?>
2<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
bansp5e2d1c02022-03-10 04:51:40 +01003 xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"
4 xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:f="func"
Piotr Banskif8af3a92022-05-23 03:20:10 +02005 xmlns:fn="http://www.w3.org/2005/xpath-functions"
Piotr Banski6a4a2522022-05-24 01:16:47 +02006 xmlns:map="http://www.w3.org/2005/xpath-functions/map" exclude-result-prefixes="xs f fn map nkjp tei"
bansp5e2d1c02022-03-10 04:51:40 +01007 version="3.0" expand-text="yes">
<!-- PARAMETERS -->

<!-- The directory containing NKJP files, in the form of a collection of text-level dirs
     (that is how we know both the $corpusID and the $docID). -->
<xsl:param name="sourceDir" as="xs:string" select="'test/resources/nkjp2korap_sample2'"/>

<!-- Where the corpus/document/text/annotations hierarchy is going to be created. -->
<xsl:param name="targetDir" as="xs:string" select="'test/output'"/>

<!-- Comma-separated list of document IDs to be skipped from processing.
     Example: HellerPodgladanie,KOT
     No functionality beyond string identity is supported (this is just for testing).
     Passing an empty string results in nothing being skipped. -->
<xsl:param name="skip_docID" as="xs:string"
    select="'HellerPodgladanie,IsakowiczZaleskiMoje,KolakowskiOco,MysliwskiKamien,WilkWilczy,ZycieWarszawy_Zycie'"/>

<!-- For debugging structure.xml production. -->
<xsl:param name="SHOW_ORTH_IN_STRUCT" select="true()" as="xs:boolean"/>
<!-- VARIABLES (= constants...) -->

<!-- Identifiers used to build the target directory hierarchy and the compound document ID. -->
<xsl:variable name="corpusID" static="yes" as="xs:string" select="'NKJP'"/>
<xsl:variable name="docID" static="yes" as="xs:string" select="'NKJP'"/>

<!-- Target corpus directory, with a trailing slash. -->
<xsl:variable name="targetCorpusDir_slashed" as="xs:string"
    select="concat($targetDir, '/', $corpusID, '/')"/>

<!-- System and public identifiers of the I5 DTD. -->
<xsl:variable name="systemDoctypeI5" static="yes" as="xs:string"
    select="'http://corpora.ids-mannheim.de/I5/DTD/i5.dtd'"/>

<xsl:variable name="publicDoctypeI5" static="yes" as="xs:string"
    select="'-//IDS//DTD I5 1.0//EN'"/>

<!-- Namespace of the generated KorAP-XML elements. -->
<xsl:variable name="KorAP_namespace" static="yes" as="xs:string"
    select="'http://ids-mannheim.de/ns/KorAP'"/>

<!-- this is only a bit funny -->
<xsl:variable name="KorAP-XML_version" static="yes" as="xs:string" select="'KorAP-0.4'"/>

<!-- Query parameters for collection();
     see https://www.saxonica.com/documentation11/index.html#!sourcedocs/collections/collection-directories -->
<xsl:variable name="collection_params" static="yes" as="xs:string"
    select="'recurse=yes;validation=strip;select=text.xml;content-type=application/xml;on-error=warning;xinclude=yes'"/>

<!-- All text.xml documents found under $sourceDir. -->
<xsl:variable name="collection_of_text" as="document-node()+"
    select="collection(concat($sourceDir, '?', $collection_params))"/>
<!-- MODES -->

<!-- In the corpus and text modes, nodes without a matching template are skipped together
     with their descendants; in header-text mode only their text is copied. -->
<xsl:mode on-no-match="deep-skip" name="corpus"/>
<xsl:mode on-no-match="deep-skip" name="text"/>
<xsl:mode on-no-match="text-only-copy" name="header-text"/>

<!-- The unnamed mode declares all accumulators as used. -->
<xsl:mode use-accumulators="#all"/>
<!-- Running counter of TEI-namespace elements that are tei:text or lie inside it;
     the counter goes up by one each time such an element is entered. -->
<xsl:accumulator name="element-index" initial-value="0" as="xs:integer">
    <xsl:accumulator-rule phase="start"
        match="tei:*[ancestor-or-self::tei:text]"
        select="$value + 1"/>
</xsl:accumulator>
<!--I think I may be able to actually merge the two accumulators, but let's see-->

<!-- morpho-offsets: character offsets computed while traversing the morphosyntax layer.
     The value is a growing sequence of single-entry maps, each keyed by an element's
     @xml:id and holding a pair of integers:
       * on entering a p or s: (offset reached so far, offset at which the element starts);
       * on leaving a seg, s or p: (start offset, end offset) of that element.
     Every rule only needs the most recent entry, so it is fetched with $value[last()]
     rather than by reversing the whole accumulated sequence. -->
<xsl:accumulator name="morpho-offsets" as="map(xs:string, item()+)+" initial-value="(map{'null':(0,0)})">

    <!-- Paragraph start. -->
    <xsl:accumulator-rule match="tei:body/tei:p" phase="start">
        <xsl:variable name="last_entry" as="map(*)" select="$value[last()]"/>
        <xsl:variable name="preceding_index" as="xs:integer"
            select="map:get($last_entry, map:keys($last_entry)[1])[2]"/>
        <!-- for paragraphs, it's in either being initial or not -->
        <xsl:variable name="our_base" as="xs:integer"
            select="if ($preceding_index eq 0) then $preceding_index else $preceding_index + 1"/>

        <xsl:sequence select="$value, map { string(@xml:id): ($preceding_index, $our_base) }"/>
    </xsl:accumulator-rule>

    <!-- Sentence start: shifted by one only when f:is_preceded_by_ws() reports whitespace. -->
    <xsl:accumulator-rule match="tei:s" phase="start">
        <xsl:variable name="last_entry" as="map(*)" select="$value[last()]"/>
        <xsl:variable name="preceding_index" as="xs:integer"
            select="map:get($last_entry, map:keys($last_entry)[1])[2]"/>
        <xsl:variable name="our_base" as="xs:integer"
            select="if ($preceding_index eq 0) then $preceding_index else $preceding_index + xs:integer(f:is_preceded_by_ws(., true()))"/>

        <xsl:sequence select="$value, map { string(@xml:id): ($preceding_index, $our_base) }"/>
    </xsl:accumulator-rule>

    <!-- I want something that won't be matched in other layers, for efficiency - that
         may allow me to merge the accumulators, eventually;
         but I also want to filter out the rejected tokenization alternatives already here -->
    <!-- Segment end: the segment spans the length of its 'orth' string. -->
    <xsl:accumulator-rule match="tei:seg[tei:fs[@type eq 'morph' and tei:f[@name eq 'disamb']]]" phase="end">
        <xsl:variable name="last_entry" as="map(*)" select="$value[last()]"/>
        <xsl:variable name="preceding_index" as="xs:integer"
            select="map:get($last_entry, map:keys($last_entry)[1])[2]"/>
        <xsl:variable name="our_base" as="xs:integer"
            select="$preceding_index + xs:integer(f:is_preceded_by_ws(., true()))"/>

        <xsl:sequence select="
            $value,
            map {
                string(@xml:id): ($our_base, $our_base + string-length(tei:fs/tei:f[@name eq 'orth']/tei:string))
            }"/>
    </xsl:accumulator-rule>

    <!-- Sentence end and paragraph end: the start offset comes from the entry recorded
         for this element at its start (the first entry found under its ID), the end offset
         is the one reached by the most recent entry. -->
    <xsl:accumulator-rule match="tei:s | tei:body/tei:p" phase="end">
        <xsl:variable name="last_entry" as="map(*)" select="$value[last()]"/>
        <xsl:variable name="preceding_index" as="xs:integer"
            select="map:get($last_entry, map:keys($last_entry)[1])[2]"/>
        <xsl:variable name="incomplete" as="xs:integer+"
            select="map:find($value, string(@xml:id))(1)"/>
        <xsl:variable name="our_base" as="xs:integer" select="$incomplete[2]"/>

        <xsl:sequence select="$value, map { string(@xml:id): ($our_base, $preceding_index) }"/>
    </xsl:accumulator-rule>
</xsl:accumulator>
<!-- segmentation-offsets: character offsets computed while traversing the segmentation layer.
     The value is a growing sequence of single-entry maps, each keyed by an element's
     @xml:id and holding a pair of integers:
       * on entering a p or s: (offset reached so far, offset at which the element starts);
       * on leaving a w: (start offset, end offset), stored under the ID of the parent seg;
       * on leaving an s, p, body or text: (start offset, end offset) of that element.
     Every rule only needs the most recent entry, so it is fetched with $value[last()]
     rather than by reversing the whole accumulated sequence. -->
<xsl:accumulator name="segmentation-offsets" as="map(xs:string, item()+)+" initial-value="(map{'null':(0,0)})">

    <!-- Paragraph start. -->
    <xsl:accumulator-rule match="tei:body/tei:p" phase="start">
        <xsl:variable name="last_entry" as="map(*)" select="$value[last()]"/>
        <xsl:variable name="preceding_index" as="xs:integer"
            select="map:get($last_entry, map:keys($last_entry)[1])[2]"/>
        <!-- for paragraphs, it's in either being initial or not -->
        <xsl:variable name="our_base" as="xs:integer"
            select="if ($preceding_index eq 0) then $preceding_index else $preceding_index + 1"/>

        <xsl:sequence select="$value, map { string(@xml:id): ($preceding_index, $our_base) }"/>
    </xsl:accumulator-rule>

    <!-- Sentence start: shifted by one only when f:is_preceded_by_ws() reports whitespace. -->
    <xsl:accumulator-rule match="tei:s" phase="start">
        <xsl:variable name="last_entry" as="map(*)" select="$value[last()]"/>
        <xsl:variable name="preceding_index" as="xs:integer"
            select="map:get($last_entry, map:keys($last_entry)[1])[2]"/>
        <xsl:variable name="our_base" as="xs:integer"
            select="if ($preceding_index eq 0) then $preceding_index else $preceding_index + xs:integer(f:is_preceded_by_ws(., true()))"/>

        <xsl:sequence select="$value, map { string(@xml:id): ($preceding_index, $our_base) }"/>
    </xsl:accumulator-rule>

    <!-- Word end, only for segments that are not marked as rejected: the entry is keyed by
         the ID of the parent seg and spans the string length of the w element. -->
    <xsl:accumulator-rule match="tei:w[parent::tei:seg[count(@nkjp:rejected) eq 0]]" phase="end">
        <xsl:variable name="last_entry" as="map(*)" select="$value[last()]"/>
        <xsl:variable name="preceding_index" as="xs:integer"
            select="map:get($last_entry, map:keys($last_entry)[1])[2]"/>
        <xsl:variable name="our_base" as="xs:integer"
            select="$preceding_index + xs:integer(f:is_preceded_by_ws(parent::tei:seg, true()))"/>

        <xsl:sequence select="
            $value,
            map {
                string(parent::tei:seg/@xml:id): ($our_base, $our_base + string-length())
            }"/>
    </xsl:accumulator-rule>

    <!-- Sentence end and paragraph end: the start offset comes from the entry recorded
         for this element at its start (the first entry found under its ID), the end offset
         is the one reached by the most recent entry. -->
    <xsl:accumulator-rule match="tei:s | tei:body/tei:p" phase="end">
        <xsl:variable name="last_entry" as="map(*)" select="$value[last()]"/>
        <xsl:variable name="preceding_index" as="xs:integer"
            select="map:get($last_entry, map:keys($last_entry)[1])[2]"/>
        <xsl:variable name="incomplete" as="xs:integer+"
            select="map:find($value, string(@xml:id))(1)"/>
        <xsl:variable name="our_base" as="xs:integer" select="$incomplete[2]"/>

        <xsl:sequence select="$value, map { string(@xml:id): ($our_base, $preceding_index) }"/>
    </xsl:accumulator-rule>

    <!-- Body end and text end: both span from 0 to the offset reached so far. -->
    <xsl:accumulator-rule match="tei:body | tei:text" phase="end">
        <xsl:variable name="last_entry" as="map(*)" select="$value[last()]"/>
        <xsl:variable name="preceding_index" as="xs:integer"
            select="map:get($last_entry, map:keys($last_entry)[1])[2]"/>

        <xsl:sequence select="$value, map { string(@xml:id): (0, $preceding_index) }"/>
    </xsl:accumulator-rule>
</xsl:accumulator>
<!-- FUNCTIONS -->

<!-- Returns the number of elements on the ancestor-or-self axis of $node,
     leaving out those whose local name is TEI or teiCorpus. -->
<xsl:function name="f:compute_nesting" as="xs:integer">
    <xsl:param name="node" as="element()"/>
    <xsl:sequence
        select="count($node/ancestor-or-self::*[not(local-name() = ('TEI', 'teiCorpus'))])"/>
</xsl:function>
<!-- Tells whether the given seg, s or p element counts as preceded by whitespace.
     $node:             the element to check; the decision depends on its local name.
     $suppress_initial: when true, a sentence-initial seg and a paragraph-initial s
                        are reported as not preceded by whitespace.
     Any other element terminates the transformation with a message. -->
<xsl:function name="f:is_preceded_by_ws" as="xs:boolean">
    <xsl:param name="node" as="element()"/>
    <xsl:param name="suppress_initial" as="xs:boolean"/>

    <xsl:variable name="kind" as="xs:string" select="local-name($node)"/>

    <xsl:choose>
        <xsl:when test="$kind eq 'seg'">
            <!-- The conditions are tried strictly in this order:
                 1. the seg carries @nkjp:nps;
                 2. the seg has an 'nps' feature (added for traversing ann_morphosyntax);
                 3. the seg is the very first non-rejected seg of the text; the otherwise very
                    costly check for preceding segs is meant to matter only when the first two
                    parts hold, so that it has a minimal search space;
                 4. initial suppression is on and the seg is the first non-rejected seg of its
                    sentence (IDs are compared rather than node identity);
                 5. as 4, for the first seg with a 'disamb' feature (added for traversing
                    ann_morphosyntax). -->
            <xsl:sequence select="
                if (exists($node/@nkjp:nps)) then
                    false()
                else if (exists($node/tei:fs/tei:f[@name eq 'nps'])) then
                    false()
                else if (exists($node/ancestor::tei:s[empty(preceding-sibling::tei:s)])
                         and exists($node/ancestor::tei:p[empty(preceding-sibling::tei:p)])
                         and empty($node/preceding::tei:seg[empty(@nkjp:rejected)])) then
                    false()
                else if ($suppress_initial
                         and $node/ancestor::tei:s/descendant::tei:seg[empty(@nkjp:rejected)][1]/@xml:id eq $node/@xml:id) then
                    false()
                else if ($suppress_initial
                         and $node/ancestor::tei:s/descendant::tei:seg[tei:fs/tei:f[@name eq 'disamb']][1]/@xml:id eq $node/@xml:id) then
                    false()
                else
                    true()"/>
        </xsl:when>
        <xsl:when test="$kind eq 's'">
            <!-- A non-initial sentence always qualifies; a paragraph-initial one qualifies only
                 without suppression and when its paragraph is not the first one. -->
            <xsl:sequence select="
                exists($node/preceding-sibling::tei:s)
                or (not($suppress_initial) and exists($node/ancestor::tei:p[1]/preceding-sibling::tei:p))"/>
        </xsl:when>
        <xsl:when test="$kind eq 'p'">
            <xsl:sequence select="exists($node/preceding-sibling::tei:p)"/>
        </xsl:when>
        <xsl:otherwise>
            <xsl:message terminate="yes"
                select="'Wrong argument passed to f:is_preceded_by_ws(): ' || local-name($node) || ' Only p, s, seg are allowed.'"/>
        </xsl:otherwise>
    </xsl:choose>
</xsl:function>
<!-- UTILITY TEMPLATES -->

<!-- Produce nothing, in any mode, for:
     * @default: this is to delete some auto-inserted attribute throughout;
     * tei:w: NKJP-SGJP has apparently resigned from standoff representations by adding <w>
       everywhere; we reach for them, but from the level of <seg>, so we don't need to
       process <w> separately. -->
<xsl:template mode="#all" match="@default | tei:w"/>
<!-- fall-thru, skipping the potential <paren> element and filtering out the bad guys:
     in both the struct and the morpho mode, processing continues in the same mode with
     the descendant segs that carry no @nkjp:rejected -->
<xsl:template match="tei:choice" mode="struct morpho">
    <xsl:apply-templates mode="#current" select="descendant::tei:seg[empty(@nkjp:rejected)]"/>
</xsl:template>
<!-- MAIN PROCESSING -->

<!-- Entry point: writes the corpus header once, then converts every text.xml of the
     collection, together with its sibling annotation files, unless the text's directory
     name is listed in $skip_docID. -->
<xsl:template name="xsl:initial-template">
    <!-- Whitespace around the listed IDs and empty list items are ignored. -->
    <xsl:variable name="IDs_to_skip" as="xs:string*"
        select="(tokenize($skip_docID, ',') ! normalize-space(.))[. ne '']"/>

    <!-- we only want to call the template below once, and we process the first NKJP corpus file
         of the collection for that purpose, because all we need is the main corpus header, and
         we can (should) get to that from any NKJP corpus document -->
    <xsl:call-template name="create_corpus_header">
        <xsl:with-param name="text.xml" select="$collection_of_text[1]" as="document-node()"/>
        <xsl:with-param name="target" select="$targetCorpusDir_slashed || 'header.xml'" as="xs:string"/>
    </xsl:call-template>

    <xsl:for-each select="$collection_of_text">
        <!-- The regex is anchored, so that only the trailing file name is stripped. -->
        <xsl:variable name="my_dir" as="xs:string" select="replace(base-uri(), '/text\.xml$', '')"/>
        <!-- The name of the text-level directory serves as the text ID. -->
        <xsl:variable name="my_textID" as="xs:string" select="tokenize($my_dir, '/')[last()]"/>
        <xsl:variable name="ann_morphosyntax.uri" as="xs:string" select="$my_dir || '/ann_morphosyntax.xml'"/>
        <xsl:variable name="ann_segmentation.uri" as="xs:string" select="$my_dir || '/ann_segmentation.xml'"/>
        <xsl:variable name="ann_named.uri" as="xs:string" select="$my_dir || '/ann_named.xml'"/>
        <xsl:variable name="ann_groups.uri" as="xs:string" select="$my_dir || '/ann_groups.xml'"/>
        <xsl:variable name="ann_words.uri" as="xs:string" select="$my_dir || '/ann_words.xml'"/>

        <!-- skipping is a utility step, for when we want to ignore some texts for any reason
             (debugging, selective update) -->
        <xsl:if test="not($my_textID = $IDs_to_skip)">
            <xsl:call-template name="process_single_sample">
                <xsl:with-param name="text.xml" as="document-node()" select="."/>
                <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
                    select="doc($ann_morphosyntax.uri)"/>
                <xsl:with-param name="ann_segmentation.xml" as="document-node()"
                    select="doc($ann_segmentation.uri)"/>
                <xsl:with-param name="my_textID" select="$my_textID" as="xs:string"/>
                <!-- the following parameters may happen to be empty -->
                <xsl:with-param name="ann_named.xml" as="document-node()*"
                    select="if (fn:doc-available($ann_named.uri)) then doc($ann_named.uri) else ()"/>
                <xsl:with-param name="ann_groups.xml" as="document-node()*"
                    select="if (fn:doc-available($ann_groups.uri)) then doc($ann_groups.uri) else ()"/>
                <xsl:with-param name="ann_words.xml" as="document-node()*"
                    select="if (fn:doc-available($ann_words.uri)) then doc($ann_words.uri) else ()"/>
            </xsl:call-template>
        </xsl:if>
    </xsl:for-each>
</xsl:template>
<!-- Converts a single NKJP text: calls, in turn, the templates producing data.xml,
     struct/structure.xml, nkjp/morpho.xml and header.xml, plus nkjp/named.xml and
     nkjp/groups.xml when the corresponding source layers have been supplied. -->
<xsl:template name="process_single_sample">
    <xsl:param name="text.xml" as="document-node()"/>
    <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
    <xsl:param name="ann_segmentation.xml" as="document-node()"/>
    <!-- empty textID should never happen, but if it does, it will be signalled at the top of the output -->
    <xsl:param name="my_textID" as="xs:string" select="'0-BAD_textID'"/>
    <!-- optional layers; each of these may be an empty sequence -->
    <xsl:param name="ann_named.xml" as="document-node()*"/>
    <xsl:param name="ann_groups.xml" as="document-node()*"/>
    <xsl:param name="ann_words.xml" as="document-node()*"/>

    <!-- directory receiving all the files generated for this text -->
    <xsl:variable name="targetBaseDir" as="xs:string"
        select="concat($targetCorpusDir_slashed, $docID, '/', $my_textID)"/>

    <!-- this is what occurs in the text and data layers as @docid -->
    <xsl:variable name="compoundID" as="xs:string"
        select="concat($corpusID, '_', $docID, '.', $my_textID)"/>

    <xsl:call-template name="create_data">
        <xsl:with-param name="target" as="xs:string" select="concat($targetBaseDir, '/data.xml')"/>
        <xsl:with-param name="compoundID" as="xs:string" select="$compoundID"/>
        <xsl:with-param name="ann_segmentation.xml" as="document-node()" select="$ann_segmentation.xml"/>
    </xsl:call-template>

    <xsl:call-template name="create_struct">
        <xsl:with-param name="target" as="xs:string" select="concat($targetBaseDir, '/struct/structure.xml')"/>
        <xsl:with-param name="compoundID" as="xs:string" select="$compoundID"/>
        <xsl:with-param name="ann_segmentation.xml" as="document-node()" select="$ann_segmentation.xml"/>
    </xsl:call-template>

    <xsl:call-template name="create_morpho">
        <xsl:with-param name="target" as="xs:string" select="concat($targetBaseDir, '/nkjp/morpho.xml')"/>
        <xsl:with-param name="compoundID" as="xs:string" select="$compoundID"/>
        <xsl:with-param name="ann_segmentation.xml" as="document-node()" select="$ann_segmentation.xml"/>
        <xsl:with-param name="ann_morphosyntax.xml" as="document-node()" select="$ann_morphosyntax.xml"/>
    </xsl:call-template>

    <xsl:call-template name="create_text_header">
        <xsl:with-param name="target" as="xs:string" select="concat($targetBaseDir, '/header.xml')"/>
        <xsl:with-param name="compoundID" as="xs:string" select="$compoundID"/>
        <xsl:with-param name="text.xml" as="document-node()" select="$text.xml"/>
    </xsl:call-template>

    <!-- named entities: only when ann_named.xml has been supplied -->
    <xsl:if test="exists($ann_named.xml)">
        <xsl:call-template name="create_named">
            <xsl:with-param name="target" as="xs:string" select="concat($targetBaseDir, '/nkjp/named.xml')"/>
            <xsl:with-param name="compoundID" as="xs:string" select="$compoundID"/>
            <xsl:with-param name="ann_segmentation.xml" as="document-node()" select="$ann_segmentation.xml"/>
            <xsl:with-param name="ann_morphosyntax.xml" as="document-node()" select="$ann_morphosyntax.xml"/>
            <xsl:with-param name="ann_named.xml" as="document-node()" select="$ann_named.xml"/>
        </xsl:call-template>
    </xsl:if>

    <!-- groups: only when both ann_words.xml and ann_groups.xml have been supplied -->
    <xsl:if test="exists($ann_words.xml) and exists($ann_groups.xml)">
        <xsl:call-template name="create_groups">
            <xsl:with-param name="target" as="xs:string" select="concat($targetBaseDir, '/nkjp/groups.xml')"/>
            <xsl:with-param name="compoundID" as="xs:string" select="$compoundID"/>
            <xsl:with-param name="ann_segmentation.xml" as="document-node()" select="$ann_segmentation.xml"/>
            <xsl:with-param name="ann_morphosyntax.xml" as="document-node()" select="$ann_morphosyntax.xml"/>
            <xsl:with-param name="ann_words.xml" as="document-node()" select="$ann_words.xml"/>
            <xsl:with-param name="ann_groups.xml" as="document-node()" select="$ann_groups.xml"/>
        </xsl:call-template>
    </xsl:if>
</xsl:template>
<!-- ************************** data.xml ******************* -->

<!-- Creates the data.xml file: a raw_text element holding the primary text, rebuilt from
     the segmentation layer.
     $ann_segmentation.xml: the segmentation document of the text being processed
     $compoundID:           value of the docid attribute of the root element
     $target:               URI of the file to be written -->
<xsl:template name="create_data">
    <xsl:param name="ann_segmentation.xml" as="document-node()"/>
    <xsl:param name="compoundID" as="xs:string"/>
    <xsl:param name="target" as="xs:string"/>

    <!-- xpath-default-namespace is not an attribute value template, so the namespace is
         spelled out literally here rather than referenced in curly braces. -->
    <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
        xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">

        <xsl:processing-instruction name="xml-model">href=&quot;text.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
        <xsl:element name="raw_text" namespace="{$KorAP_namespace}">
            <xsl:attribute name="docid" select="$compoundID"/>
            <xsl:element name="metadata" namespace="{$KorAP_namespace}">
                <xsl:attribute name="file" select="'metadata.xml'"/>
            </xsl:element>

            <xsl:element name="text" namespace="{$KorAP_namespace}">
                <!-- For every seg not marked as rejected: a space if the seg is preceded by
                     whitespace (an empty string otherwise), followed by the seg's w child. -->
                <xsl:variable name="content" as="xs:string+">
                    <xsl:for-each select="$ann_segmentation.xml/tei:teiCorpus/tei:TEI/tei:text/tei:body/tei:p/tei:s//tei:seg[count(@nkjp:rejected) eq 0]">
                        <xsl:sequence select="
                            if (f:is_preceded_by_ws(., false())) then
                                ' '
                            else
                                '', ./tei:w"/>
                    </xsl:for-each>
                </xsl:variable>
                <xsl:value-of select="string-join($content)"/>
            </xsl:element>
        </xsl:element>
    </xsl:result-document>
</xsl:template>
<!-- ************************** struct ******************* -->

<!-- Creates the structure file: a layer element whose spanList is filled by applying the
     struct-mode templates to every tei:text of the segmentation document.
     $compoundID:           value of the docid attribute of the layer element
     $ann_segmentation.xml: the segmentation document of the text being processed
     $target:               URI of the file to be written -->
<xsl:template name="create_struct">
    <xsl:param name="compoundID" as="xs:string"/>
    <xsl:param name="ann_segmentation.xml" as="document-node()"/>
    <xsl:param name="target" as="xs:string"/>

    <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
        xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
        <xsl:processing-instruction name="xml-model">href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
        <xsl:element name="layer" namespace="{$KorAP_namespace}">
            <xsl:attribute name="docid">{$compoundID}</xsl:attribute>
            <xsl:attribute name="version">{$KorAP-XML_version}</xsl:attribute>

            <xsl:element name="spanList" namespace="{$KorAP_namespace}">
                <xsl:apply-templates mode="struct" select="$ann_segmentation.xml/descendant::tei:text"/>
            </xsl:element>
        </xsl:element>
    </xsl:result-document>
</xsl:template>
<!-- Emits one span for a TEI element of the segmentation layer, then continues with the
     element's children in the same mode.
     The span carries: an id built from the element-index accumulator, the from/to offsets
     taken from the last entry of the segmentation-offsets accumulator, the nesting level,
     and a feature structure with the element's name and (if present) its attributes. -->
<xsl:template match="tei:*" mode="struct">
    <xsl:variable name="offsets" as="xs:integer+"
        select="map:get(fn:accumulator-after('segmentation-offsets')[last()], string(@xml:id))"/>
    <xsl:variable name="my_name" as="xs:string" select="local-name()"/>
    <xsl:variable name="my_index" as="xs:integer" select="fn:accumulator-before('element-index')"/>

    <xsl:element name="span" namespace="{$KorAP_namespace}">
        <xsl:attribute name="id" select="'s' || $my_index"/>
        <xsl:attribute name="from" select="$offsets[1]"/>
        <xsl:attribute name="to" select="$offsets[2]"/>
        <xsl:attribute name="l" select="f:compute_nesting(.)"/>
        <!-- debugging aid: show the text of a seg as a comment -->
        <xsl:if test="$my_name eq 'seg' and $SHOW_ORTH_IN_STRUCT">
            <xsl:comment><xsl:value-of select="fn:normalize-space(.)"/></xsl:comment>
        </xsl:if>
        <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
            <!-- STRUCT vs. LEX for morpho -->
            <xsl:attribute name="type" select="'struct'"/>
            <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                <xsl:attribute name="name" select="'name'"/>
                <xsl:value-of select="$my_name"/>
            </xsl:element>
            <!-- the attributes of the source element, one f per attribute -->
            <xsl:if test="exists(@*)">
                <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                    <xsl:attribute name="name" select="'attr'"/>
                    <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
                        <xsl:attribute name="type" select="'attr'"/>
                        <xsl:for-each select="@*">
                            <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                                <xsl:attribute name="name" select="local-name(.)"/>
                                <xsl:value-of select="."/>
                            </xsl:element>
                        </xsl:for-each>
                    </xsl:element>
                </xsl:element>
            </xsl:if>
        </xsl:element>
    </xsl:element>
    <xsl:apply-templates mode="struct"/>
</xsl:template>
573
574 <!-- ************************** morpho ******************* -->
575
<!-- Writes the morphology layer to the file named by $target.
     Parameters:
       compoundID            value of layer/@docid
       ann_segmentation.xml  document whose tei:text descendants are processed in mode "morpho"
       ann_morphosyntax.xml  passed on as a tunnel parameter to the "morpho" templates
       target                href of the result document -->
<xsl:template name="create_morpho">
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <!-- association with the RelaxNG schema span.rng -->
    <xsl:processing-instruction name="xml-model"
      select="'href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;'"/>
    <xsl:element name="layer" namespace="{$KorAP_namespace}">
      <xsl:attribute name="docid">{$compoundID}</xsl:attribute>
      <xsl:attribute name="version">{$KorAP-XML_version}</xsl:attribute>

      <xsl:element name="spanList" namespace="{$KorAP_namespace}">
        <xsl:apply-templates mode="morpho" select="$ann_segmentation.xml//tei:text">
          <xsl:with-param name="ann_morphosyntax.xml" tunnel="yes" as="document-node()"
            select="$ann_morphosyntax.xml"/>
        </xsl:apply-templates>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
597
<!-- Morphology layer: emits one KorAP span for a tei:seg of the segmentation
     document, described by the disambiguated interpretation of the matching
     segment in the tunnelled morphosyntax document.
       @id        'm' followed by the value of the 'element-index' accumulator
       @from/@to  first and second integer stored under @xml:id in the map read
                  from the 'segmentation-offsets' accumulator
       @l         result of f:compute_nesting(.)
     Features written: lemma, pos, msd (only when non-empty) and join="left"
     (only when the morphosyntax segment carries an f named 'nps').
     Every lookup variable is typed as exactly one item, so a missing or
     ambiguous match stops the transformation with a type error. -->
<xsl:template match="tei:seg" mode="morpho">
  <xsl:param name="ann_morphosyntax.xml" as="document-node()" tunnel="yes"/>
  <!-- The lookup is deliberately spread over several variables, so that each
       constituent value can be inspected should anything go wrong; compacting it
       would gain little efficiency and cost readability. -->
  <xsl:variable name="offsets" as="xs:integer+"
    select="map:get(fn:accumulator-after('segmentation-offsets')[last()], string(@xml:id))"/>
  <xsl:variable name="my_id" as="xs:string" select="@xml:id"/>
  <!-- the morphosyntax segment whose @corresp fragment points at this segment.
       NOTE(review): this scans all tei:seg elements of the morphosyntax document
       for every segment; a stylesheet-level xsl:key would avoid the repeated scan -->
  <xsl:variable name="my_morph-seg" as="element(tei:seg)"
    select="$ann_morphosyntax.xml//tei:seg[substring-after(@corresp,'#') eq $my_id]"/>
  <xsl:variable name="my_disamb" as="element(tei:f)"
    select="$my_morph-seg//tei:fs/tei:f[@name eq 'disamb']"/>
  <!-- id of the tei:symbol selected by the disambiguation -->
  <xsl:variable name="my_choice-id" as="xs:string"
    select="substring-after($my_disamb//tei:f[@name eq 'choice']/@fVal,'#')"/>
  <!-- the 'lex' feature structure that contains the selected symbol -->
  <xsl:variable name="my_choice-lex" as="element(tei:fs)"
    select="$my_morph-seg//tei:f[@name eq 'interps']/tei:fs[@type eq 'lex'][descendant::tei:symbol[@xml:id eq $my_choice-id]]"/>
  <xsl:variable name="chosen-msd" as="xs:string"
    select="$my_choice-lex/descendant::tei:symbol[@xml:id eq $my_choice-id]/@value"/>
  <xsl:variable name="my_index" as="xs:integer" select="fn:accumulator-before('element-index')"/>

  <xsl:element name="span" namespace="{$KorAP_namespace}">
    <xsl:attribute name="id" select="'m' || $my_index"/>
    <xsl:attribute name="from" select="$offsets[1]"/>
    <xsl:attribute name="to" select="$offsets[2]"/>
    <xsl:attribute name="l" select="f:compute_nesting(.)"/>
    <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
      <xsl:attribute name="type" select="'lex'"/>
      <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
        <xsl:attribute name="name" select="'lex'"/>
        <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
          <!-- the orthographic form, as a comment for human readers -->
          <xsl:comment select="$my_morph-seg//tei:fs/tei:f[@name eq 'orth']/tei:string"/>

          <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="name" select="'lemma'"/>
            <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'base']/tei:string"/>
          </xsl:element>
          <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="name" select="'pos'"/>
            <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'ctag']/tei:symbol/@value"/>
          </xsl:element>
          <xsl:if test="$chosen-msd ne ''">
            <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
              <xsl:attribute name="name" select="'msd'"/>
              <xsl:value-of select="$chosen-msd"/>
            </xsl:element>
          </xsl:if>
          <xsl:if test="$my_morph-seg//tei:fs/tei:f[@name eq 'nps']">
            <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
              <xsl:attribute name="name" select="'join'"/>
              <xsl:value-of select="'left'"/>
            </xsl:element>
          </xsl:if>
        </xsl:element>
      </xsl:element>
    </xsl:element>
  </xsl:element>
</xsl:template>
banspe726b4a2022-03-28 05:47:45 +0200652
Piotr Banskic5950ce2022-05-27 15:07:08 +0200653 <!-- ************************** named entities ******************* -->
654
<!-- Writes the named-entity layer to the file named by $target.
     Parameters:
       compoundID            value of layer/@docid
       ann_segmentation.xml  not used by the body at present (probably to be removed)
       ann_morphosyntax.xml  document whose tei:text descendants are processed in mode "named"
       ann_named.xml         passed on as a tunnel parameter to the "named" templates
       target                href of the result document -->
<xsl:template name="create_named">
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="ann_named.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <!-- association with the RelaxNG schema span.rng -->
    <xsl:processing-instruction name="xml-model"
      select="'href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;'"/>
    <xsl:element name="layer" namespace="{$KorAP_namespace}">
      <xsl:attribute name="docid">{$compoundID}</xsl:attribute>
      <xsl:attribute name="version">{$KorAP-XML_version}</xsl:attribute>

      <xsl:element name="spanList" namespace="{$KorAP_namespace}">
        <xsl:apply-templates mode="named" select="$ann_morphosyntax.xml//tei:text">
          <!--<xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml" as="document-node()" tunnel="yes"/>-->
          <xsl:with-param name="ann_named.xml" tunnel="yes" as="document-node()"
            select="$ann_named.xml"/>
        </xsl:apply-templates>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
Piotr Banski65a6d0b2022-05-31 17:23:08 +0200678
<!-- In mode "named" a tei:seg produces nothing, unless the more specific
     template for segments carrying a 'disamb' feature applies. -->
<xsl:template mode="named" match="tei:seg"/>
680
<!-- Named-entity layer: emits one KorAP span for every morphosyntax tei:seg
     that has a tei:fs with an f named 'disamb'.
       @id        'n' followed by the value of the 'element-index' accumulator
       @from/@to  first and second integer stored under @xml:id in the map read
                  from the 'morpho-offsets' accumulator
       @l         result of f:compute_nesting(.)
     The entity value written is still the literal 'placeholder'; the tunnelled
     ann_named.xml document is declared but not consulted yet. -->
<xsl:template match="tei:seg[tei:fs[tei:f[@name eq 'disamb']]]" mode="named">
  <xsl:param name="ann_named.xml" as="document-node()" tunnel="yes"/>

  <xsl:variable name="offsets" as="xs:integer+"
    select="map:get(fn:accumulator-after('morpho-offsets')[last()], string(@xml:id))"/>
  <xsl:variable name="my_index" as="xs:integer" select="fn:accumulator-before('element-index')"/>

  <xsl:element name="span" namespace="{$KorAP_namespace}">
    <xsl:attribute name="id" select="'n' || $my_index"/>
    <xsl:attribute name="from" select="$offsets[1]"/>
    <xsl:attribute name="to" select="$offsets[2]"/>
    <xsl:attribute name="l" select="f:compute_nesting(.)"/>
    <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
      <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
        <xsl:attribute name="name" select="'ne'"/>
        <!-- comment for human readers: the orthographic form, prefixed with a
             space when the segment has an 'nps' feature and with '_' otherwise -->
        <xsl:comment select="(if(tei:fs/tei:f[@name eq 'nps']) then ' ' else '_') || tei:fs/tei:f[@name eq 'orth']/tei:string"/>
        <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
          <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="name" select="'ent'"/>
            <xsl:value-of select="'placeholder'"/>
          </xsl:element>
        </xsl:element>
      </xsl:element>
    </xsl:element>
  </xsl:element>
</xsl:template>
710
Piotr Banskic5950ce2022-05-27 15:07:08 +0200711
712 <!-- ************************** syntactic chunks ******************* -->
713
<!-- Writes the syntactic-chunk layer to the file named by $target.
     The dispatch to mode "groups" is commented out, so the layer currently
     contains an empty spanList and the four document parameters are unused.
     Parameters:
       compoundID  value of layer/@docid
       ann_segmentation.xml, ann_morphosyntax.xml, ann_words.xml, ann_groups.xml
                   input documents reserved for the disabled dispatch below
       target      href of the result document -->
<xsl:template name="create_groups">
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="ann_words.xml" as="document-node()"/>
  <xsl:param name="ann_groups.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <!-- association with the RelaxNG schema span.rng -->
    <xsl:processing-instruction name="xml-model"
      select="'href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;'"/>
    <xsl:element name="layer" namespace="{$KorAP_namespace}">
      <xsl:attribute name="docid">{$compoundID}</xsl:attribute>
      <xsl:attribute name="version">{$KorAP-XML_version}</xsl:attribute>

      <xsl:element name="spanList" namespace="{$KorAP_namespace}">
        <!--<xsl:apply-templates select="$ann_segmentation.xml//tei:text" mode="groups">
          <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml" as="document-node()" tunnel="yes"/>
          <xsl:with-param name="ann_words.xml" select="$ann_words.xml" as="document-node()" tunnel="yes"/>
          <xsl:with-param name="ann_groups.xml" select="$ann_groups.xml" as="document-node()" tunnel="yes"/>
        </xsl:apply-templates>-->
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
739
bansp5f841732022-03-16 06:27:31 +0100740 <!-- ************************** TEXT header ******************* -->
741
<!-- Writes the text-level header to the file named by $target.
     Parameters:
       text.xml    source document; the children of its tei:TEI/tei:teiHeader
                   are processed in mode "text"
       compoundID  tunnelled to the "text" templates
       target      href of the result document -->
<xsl:template name="create_text_header">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="target" as="xs:string"/>

  <!-- create the local header.xml file -->
  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">

    <idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
      <xsl:apply-templates mode="text" select="$text.xml//tei:TEI/tei:teiHeader/tei:*">
        <xsl:with-param name="compoundID" tunnel="yes" as="xs:string" select="$compoundID"/>
      </xsl:apply-templates>
    </idsHeader>
  </xsl:result-document>
</xsl:template>
758
<!-- Text header: re-creates fileDesc in no namespace and processes its children. -->
<xsl:template match="tei:fileDesc" mode="text">
  <fileDesc>
    <xsl:apply-templates mode="#current"/>
  </fileDesc>
</xsl:template>
764
<!-- Text header: a title becomes t.title. Its content is processed in mode
     "header-text", the mode that the tei:p template and the corpus-level
     tei:title template of this stylesheet use for running header text;
     applying templates without a mode would hand the content over to the
     default mode instead. -->
<xsl:template match="tei:title" mode="text">
  <t.title>
    <xsl:apply-templates mode="header-text"/>
  </t.title>
</xsl:template>
770
<!-- Text header: titleStmt opens with a textSigle holding the tunnelled
     $compoundID, followed by the processed children of the source element. -->
<xsl:template match="tei:titleStmt" mode="text">
  <xsl:param name="compoundID" as="xs:string" tunnel="yes"/>
  <titleStmt>
    <textSigle>{$compoundID}</textSigle>
    <xsl:apply-templates mode="#current"/>
  </titleStmt>
</xsl:template>
780
<!-- Text header: re-creates publicationStmt in no namespace and processes its children. -->
<xsl:template match="tei:publicationStmt" mode="text">
  <publicationStmt>
    <xsl:apply-templates mode="#current"/>
  </publicationStmt>
</xsl:template>
786
<!-- Text header: re-creates availability in no namespace; only its attributes
     and child elements are processed, so text nodes directly inside it are dropped. -->
<xsl:template match="tei:availability" mode="text">
  <availability>
    <xsl:apply-templates select="@* | *" mode="#current"/>
  </availability>
</xsl:template>
792
<!-- Text header: re-creates profileDesc in no namespace and processes its children. -->
<xsl:template match="tei:profileDesc" mode="text">
  <profileDesc>
    <xsl:apply-templates mode="#current"/>
  </profileDesc>
</xsl:template>
bansp5f841732022-03-16 06:27:31 +0100798
<!-- Text header: re-creates textClass in no namespace; only its attributes
     and child elements are processed. -->
<xsl:template match="tei:textClass" mode="text">
  <textClass>
    <xsl:apply-templates select="@* | *" mode="#current"/>
  </textClass>
</xsl:template>
804
<!-- Text and corpus headers: re-creates catRef in no namespace and processes
     its attributes and child elements. The template serves two modes, so it
     continues in the mode it was invoked in (#current) rather than always
     switching to "text". -->
<xsl:template match="tei:catRef" mode="text corpus">
  <xsl:element name="{local-name()}">
    <xsl:apply-templates mode="#current" select="@* | *"/>
  </xsl:element>
</xsl:template>
810
<!-- Text and corpus headers: the attributes listed in the pattern are copied
     unchanged; @xml:id is matched only below tei:classDecl. -->
<xsl:template mode="text corpus"
  match="@status | @scheme | @target | @type | @xml:id[ancestor::tei:classDecl] | @xml:lang">
  <xsl:copy/>
</xsl:template>
814
<!-- Text and corpus headers: re-creates p in no namespace; its content is
     handed over to mode "header-text". -->
<xsl:template match="tei:p" mode="text corpus">
  <p>
    <xsl:apply-templates mode="header-text"/>
  </p>
</xsl:template>
820
821
822 <!-- OPTIMIZATION has to take modes into account -->
bansp5e2d1c02022-03-10 04:51:40 +0100823 <!-- ************************** CORPUS header ******************* -->
<!-- Writes the corpus-level header to the file named by $target.
     Parameters:
       text.xml  source document; the children of its tei:teiCorpus/tei:teiHeader
                 are processed in mode "corpus"
       target    href of the result document -->
<xsl:template name="create_corpus_header">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <!-- create the corpus-level header.xml file -->
  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes">

    <!--doctype-public="{$publicDoctypeI5}"
        doctype-system="{$systemDoctypeI5}">
        these are, sadly, useless
    -->

    <idsHeader type="corpus" pattern="text" status="new" version="1.1" TEIform="teiHeader">
      <xsl:apply-templates mode="corpus" select="$text.xml/tei:teiCorpus/tei:teiHeader/tei:*"/>
    </idsHeader>
  </xsl:result-document>
</xsl:template>
841
<!-- Corpus header: re-creates fileDesc in no namespace and processes its children. -->
<xsl:template match="tei:fileDesc" mode="corpus">
  <fileDesc>
    <xsl:apply-templates mode="#current"/>
  </fileDesc>
</xsl:template>
bansp9103aab2022-03-19 05:10:21 +0100847
bansp5e2d1c02022-03-10 04:51:40 +0100848
<!-- Corpus header: a title becomes c.title; its attributes are processed in
     mode "corpus", its content in mode "header-text". -->
<xsl:template match="tei:title" mode="corpus">
  <c.title>
    <xsl:apply-templates select="@*" mode="#current"/>
    <xsl:apply-templates mode="header-text"/>
  </c.title>
</xsl:template>
855
<!-- Corpus header: titleStmt opens with a korpusSigle holding the global
     $corpusID, followed by the processed children of the source element. -->
<xsl:template match="tei:titleStmt" mode="corpus">
  <titleStmt>
    <korpusSigle>{$corpusID}</korpusSigle>
    <xsl:apply-templates mode="#current"/>
  </titleStmt>
</xsl:template>
864
<!-- Corpus header: re-creates publicationStmt in no namespace and processes its children. -->
<xsl:template match="tei:publicationStmt" mode="corpus">
  <publicationStmt>
    <xsl:apply-templates mode="#current"/>
  </publicationStmt>
</xsl:template>
870
<!-- Corpus header: re-creates availability in no namespace; only its attributes
     and child elements are processed. -->
<xsl:template match="tei:availability" mode="corpus">
  <availability>
    <xsl:apply-templates select="@* | *" mode="#current"/>
  </availability>
</xsl:template>
876
<!-- Corpus header: re-creates encodingDesc in no namespace and processes its children. -->
<xsl:template match="tei:encodingDesc" mode="corpus">
  <encodingDesc>
    <xsl:apply-templates mode="#current"/>
  </encodingDesc>
</xsl:template>
882
<!-- Corpus header: classDecl, taxonomy, category and a bibl directly inside
     taxonomy are re-created under their own local name in no namespace; only
     their attributes and child elements are processed. -->
<xsl:template mode="corpus"
  match="tei:classDecl | tei:taxonomy | tei:category | tei:taxonomy/tei:bibl">
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="@* | *" mode="#current"/>
  </xsl:element>
</xsl:template>
888
<!-- Corpus header: a title inside bibl, an edition and a desc keep their own
     local name (in no namespace); attributes are processed in mode "corpus",
     content in mode "header-text". The pattern tei:bibl/tei:title has a higher
     default priority than the plain tei:title pattern, so titles inside bibl
     are handled here and do not become c.title. -->
<xsl:template mode="corpus" match="tei:bibl/tei:title | tei:edition | tei:desc">
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="@*" mode="#current"/>
    <xsl:apply-templates mode="header-text"/>
  </xsl:element>
</xsl:template>
895<!--
896 <xsl:template match="tei:textClass" mode="corpus">
897 <xsl:element name="{local-name()}">
898 <xsl:apply-templates mode="corpus" select="@* | *"/>
899 </xsl:element>
900 </xsl:template>
901
902 <xsl:template match="tei:catRef" mode="corpus">
903 <xsl:element name="{local-name()}">
904 <xsl:apply-templates mode="corpus" select="@* | *"/>
905 </xsl:element>
906 </xsl:template>
907-->
bansp5e2d1c02022-03-10 04:51:40 +0100908
909
910
<!-- this template can be called by the XSpec test; TODO: find a way to call the main() template directly -->
<!-- I have not fully handled the param transmission, which would have to be kludged in just for the sake of XSpec,
     because I'm disabling this for now, due to XSpec design issues; relevant links, among others:

     https://stackoverflow.com/questions/64933277/what-is-the-cause-of-error-cannot-execute-xslresult-document-while-evaluating
     https://www.balisage.net/Proceedings/vol25/html/Galtman01/BalisageVol25-Galtman01.html

     In short: the internal design of XSpec forces kludges when one wants to use xsl:result-document in their stylesheets. But I don't
     want to be strangled by kludges at the beginning of work, and I've already lost quite a bit of time on this investigation.
     I will therefore "just code" and can then think of externalizing bits of templates if we want to play with tests. For now,
     I don't want to have to handle context items in a special way inside variables, etc., because I'm not sure it's worth it.

-->
924 <!--<xsl:template name="test_full">
925 <xsl:param name="corpusID"/>
926 <xsl:param name="docID"/>
927 <xsl:param name="textID"/>
928 <xsl:call-template name="xsl:initial-template"/>
929 </xsl:template>-->
930
Akron9a8ee3e2022-01-31 13:51:49 +0100931</xsl:stylesheet>
Piotr Banski6a4a2522022-05-24 01:16:47 +0200932
Piotr Banskifdc858a2022-05-25 02:40:32 +0200933<!-- template for serializing maps in messages <xsl:message select="('map:',serialize($map, map{'method':'adaptive'}))"/> -->