blob: 9430c701c8659b02a2f2a6b3cb3656f1eaa4a56b [file] [log] [blame]
Akron9a8ee3e2022-01-31 13:51:49 +01001<?xml version="1.0" encoding="UTF-8"?>
2<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
bansp5e2d1c02022-03-10 04:51:40 +01003 xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"
4 xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:f="func"
5 xmlns:map="http://www.w3.org/2005/xpath-functions/map" exclude-result-prefixes="xs f map nkjp tei"
6 version="3.0" expand-text="yes">
Akron9a8ee3e2022-01-31 13:51:49 +01007
banspe726b4a2022-03-28 05:47:45 +02008
9<!-- PARAMETERS -->
bansp5e2d1c02022-03-10 04:51:40 +010010
bansp8f6700b2022-03-27 05:27:09 +020011 <xsl:param name="sourceDir" select="'test/resources/nkjp2korap_sample2'" as="xs:string"/>
12 <!-- the directory containing NKJP files, in the form of a collection of text-level dirs -->
Akron9a8ee3e2022-01-31 13:51:49 +010013
bansp8f6700b2022-03-27 05:27:09 +020014 <xsl:param name="targetDir" select="'test/output'" as="xs:string"/>
banspf2b24e62022-03-28 18:12:08 +020015
16 <xsl:param name="skip_docID" as="xs:string">
banspb5992532022-03-29 15:55:44 +020017 <xsl:value-of select="'HellerPodgladanie,IsakowiczZaleskiMoje,KolakowskiOco,MysliwskiKamien,WilkWilczy,ZycieWarszawy_Zycie'"/>
18 </xsl:param>
19 <!-- comma-separated list of document IDs to be skipped from processing
banspf2b24e62022-03-28 18:12:08 +020020 example: HellerPodgladanie,KOT
21 no functionality beyond string identity is supported -->
banspb5992532022-03-29 15:55:44 +020022
bansp8f6700b2022-03-27 05:27:09 +020023
banspe726b4a2022-03-28 05:47:45 +020024<!-- VARIABLES -->
25
26 <xsl:variable name="corpusID" as="xs:string" select="'NKJP'" static="yes"/>
27 <xsl:variable name="docID" as="xs:string" select="'NKJP'" static="yes"/>
bansp8f6700b2022-03-27 05:27:09 +020028
29 <xsl:variable name="targetCorpusDir_slashed" select="$targetDir || '/' || $corpusID || '/'" as="xs:string"/>
bansp5e2d1c02022-03-10 04:51:40 +010030
31 <xsl:variable name="systemDoctypeI5"
bansp97ba7ce2022-03-26 05:14:06 +010032 select="'http://corpora.ids-mannheim.de/I5/DTD/i5.dtd'" as="xs:string"
bansp5e2d1c02022-03-10 04:51:40 +010033 static="true"/>
34
bansp97ba7ce2022-03-26 05:14:06 +010035 <xsl:variable name="publicDoctypeI5" select="'-//IDS//DTD I5 1.0//EN'" as="xs:string"
bansp5e2d1c02022-03-10 04:51:40 +010036 static="true"/>
37
38 <xsl:variable name="KorAP_namespace" select="'http://ids-mannheim.de/ns/KorAP'" static="true"
39 as="xs:string"/>
40
bansp5f841732022-03-16 06:27:31 +010041 <xsl:variable name="KorAP-XML_version" select="'KorAP-0.4'" as="xs:string" static="true"/>
42 <!-- this is only a bit funny -->
43
banspe726b4a2022-03-28 05:47:45 +020044 <xsl:variable name="collection_params" as="xs:string" static="yes"
45 select="'recurse=yes;validation=strip;select=text.xml;content-type=application/xml;on-error=warning;xinclude=yes'"
46 />
47 <!-- see https://www.saxonica.com/documentation11/index.html#!sourcedocs/collections/collection-directories -->
48
49 <xsl:variable name="collection_of_text" select="collection($sourceDir || '?' || $collection_params)" as="document-node()+"/>
banspb5992532022-03-29 15:55:44 +020050
51
banspe726b4a2022-03-28 05:47:45 +020052<!-- MODES -->
bansp5e2d1c02022-03-10 04:51:40 +010053
  <!-- corpus: used while building the corpus-level header; nodes without a matching
       template are skipped together with their subtrees. -->
  <xsl:mode name="corpus" on-no-match="deep-skip"/>
  <!-- text: used while building the per-text header; same deep-skip policy. -->
  <xsl:mode name="text" on-no-match="deep-skip"/>
  <!-- header-text: for nodes without a matching template, only the text content is copied. -->
  <xsl:mode name="header-text" on-no-match="text-only-copy"/>
bansp5e2d1c02022-03-10 04:51:40 +010057
banspe726b4a2022-03-28 05:47:45 +020058
59 <!-- FUNCTIONS -->
60
bansp5f841732022-03-16 06:27:31 +010061 <xsl:function name="f:compute_nesting" as="xs:integer">
62 <xsl:param name="node" as="node()"/>
63 <xsl:variable name="rel_depth"
64 select="count($node/ancestor-or-self::*[local-name(.) ne 'TEI'][local-name(.) ne 'teiCorpus'])"
65 as="xs:integer"/>
66<!-- I think my skills are lacking -->
67 <xsl:sequence select="$rel_depth"/>
68 </xsl:function>
69
70 <xsl:function name="f:calc_content_length" as="xs:integer">
71 <xsl:param name="node" as="node()"/>
72 <xsl:choose>
73 <xsl:when test="$node/self::tei:text or $node/self::tei:body">
74 <xsl:variable name="last_corresp"
75 select="$node/descendant::tei:p[last()]/descendant::tei:s[last()]/descendant::tei:seg[last()]/attribute::corresp"
76 as="attribute(corresp)"/>
77 <xsl:variable name="numbers" select="substring-after(substring-before($last_corresp,')'),',')"/>
78 <xsl:sequence select="xs:integer(substring-before($numbers,',')) + xs:integer(substring-after($numbers,','))"/>
79 </xsl:when>
80 <xsl:when test="$node/self::tei:p">
81 <xsl:variable name="last_corresp"
82 select="$node/descendant::tei:s[last()]/descendant::tei:seg[last()]/attribute::corresp"
83 as="attribute(corresp)"/>
84 <xsl:variable name="numbers" select="substring-after(substring-before($last_corresp,')'),',')"/>
85 <xsl:sequence select="xs:integer(substring-before($numbers,',')) + xs:integer(substring-after($numbers,','))"/>
86 </xsl:when>
87 <xsl:when test="$node/self::tei:s">
88 <xsl:variable name="last_corresp"
89 select="$node/descendant::tei:seg[last()]/attribute::corresp"
90 as="attribute(corresp)"/>
91 <xsl:variable name="numbers" select="substring-after(substring-before($last_corresp,')'),',')"/>
92 <xsl:sequence select="xs:integer(substring-before($numbers,',')) + xs:integer(substring-after($numbers,','))"/>
93 </xsl:when>
94 <xsl:otherwise>
95 <xsl:variable name="numbers" select="substring-after(substring-before($node/@corresp,')'),',')"/>
96 <xsl:sequence select="xs:integer(substring-before($numbers,',')) + xs:integer(substring-after($numbers,','))"/>
97 </xsl:otherwise>
98 </xsl:choose>
99 </xsl:function>
bansp5e2d1c02022-03-10 04:51:40 +0100100
banspb5992532022-03-29 15:55:44 +0200101
102<!-- UTILITY TEMPLATES -->
103
  <!-- Delete an auto-inserted attribute (@default) wherever templates are applied to it. -->
  <xsl:template match="@default" mode="#all"/>

  <!-- NKJP-SGJP has apparently abandoned standoff representations by adding <w> everywhere;
       for the time being, we just stick to the standoff offsets and suppress <w>, although
       that may need to be revisited, as the NKJP format has now begun to stray from its
       schemas and assumptions. -->
  <xsl:template match="tei:w" mode="#all"/>

  <!-- THIS IS ONLY TEMPORARY:
       <choice> is suppressed because TOKENIZATION alternatives will probably make it
       necessary to abandon the straightforward mapping; for now, the stylesheet just has
       to work, even if it eats an occasional token (which it currently does: 'komuś' and
       'czym' vanish). -->
  <xsl:template match="tei:choice" mode="#all"/>
bansp8f6700b2022-03-27 05:27:09 +0200119
banspb5992532022-03-29 15:55:44 +0200120
121 <!-- MAIN PROCESSING -->
122
123
bansp5e2d1c02022-03-10 04:51:40 +0100124 <xsl:template name="xsl:initial-template">
banspf2b24e62022-03-28 18:12:08 +0200125 <xsl:variable name="IDs_to_skip" select="tokenize($skip_docID,',')" as="xs:string*"/>
126
banspe726b4a2022-03-28 05:47:45 +0200127 <!-- we only want to call the template below once, and we process a random NKJP corpus file for that purpose,
bansp8f6700b2022-03-27 05:27:09 +0200128 because all we need is the main corpus header, and we can (should) get to that from any NKJP corpus document -->
129 <xsl:call-template name="create_corpus_header">
banspe726b4a2022-03-28 05:47:45 +0200130 <xsl:with-param name="text.xml" select="$collection_of_text[1]" as="document-node()"/>
bansp8f6700b2022-03-27 05:27:09 +0200131 <xsl:with-param name="target" select="$targetCorpusDir_slashed || 'header.xml'" as="xs:string"/>
132 </xsl:call-template>
133
banspe726b4a2022-03-28 05:47:45 +0200134 <xsl:for-each select="$collection_of_text">
135 <xsl:variable name="my_dir" as="xs:string" select="replace(base-uri(),'/text\.xml','')"/>
136 <xsl:variable name="my_textID" as="xs:string" select="tokenize($my_dir,'/')[last()]"/>
137 <xsl:variable name="ann_morphosyntax.uri" select="$my_dir || '/ann_morphosyntax.xml'" as="xs:string"/>
138 <xsl:variable name="ann_segmentation.uri" select="$my_dir || '/ann_segmentation.xml'" as="xs:string"/>
139
banspf2b24e62022-03-28 18:12:08 +0200140 <xsl:choose>
141 <xsl:when test="$my_textID = $IDs_to_skip"/>
142 <xsl:otherwise>
banspb5992532022-03-29 15:55:44 +0200143 <xsl:call-template name="process_single_sample">
banspf2b24e62022-03-28 18:12:08 +0200144 <xsl:with-param name="text.xml" as="document-node()" select="."/>
145 <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
146 select="doc($ann_morphosyntax.uri)"/>
147 <xsl:with-param name="ann_segmentation.xml" as="document-node()"
148 select="doc($ann_segmentation.uri)"/>
149 <xsl:with-param name="my_textID" select="$my_textID" as="xs:string"/>
banspb5992532022-03-29 15:55:44 +0200150 </xsl:call-template>
banspf2b24e62022-03-28 18:12:08 +0200151 </xsl:otherwise>
152 </xsl:choose>
banspe726b4a2022-03-28 05:47:45 +0200153 </xsl:for-each>
bansp8f6700b2022-03-27 05:27:09 +0200154 </xsl:template>
155
156 <xsl:template name="process_single_sample">
banspe726b4a2022-03-28 05:47:45 +0200157 <xsl:param name="text.xml" as="document-node()"/>
158 <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
159 <xsl:param name="ann_segmentation.xml" as="document-node()"/>
160 <xsl:param name="my_textID" as="xs:string" select="'0BAD_textID'"/>
161
162 <xsl:variable name="targetBaseDir" as="xs:string" select="$targetCorpusDir_slashed || $docID || '/' || $my_textID"/>
163
164 <xsl:variable name="compoundID" as="xs:string"
165 select="$corpusID || '_' || $docID || '.' || $my_textID"/>
166 <!-- this is what occurs in the text and data layers as @docid -->
167
168
bansp5e2d1c02022-03-10 04:51:40 +0100169 <xsl:call-template name="create_data">
170 <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
banspe726b4a2022-03-28 05:47:45 +0200171 <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
172 <xsl:with-param name="target" select="$targetBaseDir || '/data.xml'" as="xs:string"/>
bansp5f841732022-03-16 06:27:31 +0100173 </xsl:call-template>
174
175 <xsl:call-template name="create_struct">
176 <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
banspe726b4a2022-03-28 05:47:45 +0200177 <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
bansp5f841732022-03-16 06:27:31 +0100178 <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml"
179 as="document-node()"/>
banspe726b4a2022-03-28 05:47:45 +0200180 <xsl:with-param name="target" select="$targetBaseDir || '/struct/structure.xml'" as="xs:string"
bansp5f841732022-03-16 06:27:31 +0100181 />
182 </xsl:call-template>
183
184 <xsl:call-template name="create_morpho">
185 <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
banspe726b4a2022-03-28 05:47:45 +0200186 <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
bansp3e5b20c2022-03-18 20:22:31 +0100187 <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml"
188 as="document-node()"/>
bansp5f841732022-03-16 06:27:31 +0100189 <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml"
190 as="document-node()"/>
banspe726b4a2022-03-28 05:47:45 +0200191 <xsl:with-param name="target" select="$targetBaseDir || '/nkjp/morpho.xml'" as="xs:string"/>
bansp5e2d1c02022-03-10 04:51:40 +0100192 </xsl:call-template>
193
194 <xsl:call-template name="create_text_header">
195 <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
banspe726b4a2022-03-28 05:47:45 +0200196 <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
197 <xsl:with-param name="target" select="$targetBaseDir || '/header.xml'" as="xs:string"/>
bansp5e2d1c02022-03-10 04:51:40 +0100198 </xsl:call-template>
199
bansp5e2d1c02022-03-10 04:51:40 +0100200 </xsl:template>
201
202 <!-- ************************** data.xml ******************* -->
203
204 <xsl:template name="create_data">
205 <xsl:param name="text.xml" as="document-node()"/>
banspe726b4a2022-03-28 05:47:45 +0200206 <xsl:param name="compoundID" as="xs:string"/>
bansp5f841732022-03-16 06:27:31 +0100207 <xsl:param name="target" as="xs:string"/>
bansp5e2d1c02022-03-10 04:51:40 +0100208 <!-- create the data.xml file -->
209 <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
bansp5f841732022-03-16 06:27:31 +0100210 xpath-default-namespace="{$KorAP_namespace}" href="{$target}">
bansp5e2d1c02022-03-10 04:51:40 +0100211
Akron9a8ee3e2022-01-31 13:51:49 +0100212 <xsl:processing-instruction name="xml-model">href=&quot;text.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
bansp5e2d1c02022-03-10 04:51:40 +0100213 <xsl:element name="raw_text" namespace="{$KorAP_namespace}">
bansp5f841732022-03-16 06:27:31 +0100214 <xsl:attribute name="docid" select="$compoundID"/>
bansp5e2d1c02022-03-10 04:51:40 +0100215 <xsl:element name="metadata" namespace="{$KorAP_namespace}">
216 <xsl:attribute name="file" select="'metadata.xml'"/>
217 </xsl:element>
218
219 <xsl:element name="text" namespace="{$KorAP_namespace}">
banspf79443e2022-02-25 14:25:33 +0100220 <xsl:value-of select="$text.xml//*[local-name() = 'ab']"/>
bansp5e2d1c02022-03-10 04:51:40 +0100221 </xsl:element>
Akron9a8ee3e2022-01-31 13:51:49 +0100222 </xsl:element>
banspf79443e2022-02-25 14:25:33 +0100223 </xsl:result-document>
Akron9a8ee3e2022-01-31 13:51:49 +0100224 </xsl:template>
225
bansp5f841732022-03-16 06:27:31 +0100226 <!-- ************************** struct ******************* -->
227
228 <xsl:template name="create_struct">
229 <xsl:param name="text.xml" as="document-node()"/>
banspe726b4a2022-03-28 05:47:45 +0200230 <xsl:param name="compoundID" as="xs:string"/>
bansp5f841732022-03-16 06:27:31 +0100231 <xsl:param name="ann_segmentation.xml" as="document-node()"/>
232 <xsl:param name="target" as="xs:string"/>
233
234 <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
235 xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">
236 <xsl:processing-instruction name="xml-model">href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
237 <xsl:element name="layer" namespace="{$KorAP_namespace}">
238 <xsl:attribute name="docid" select="$compoundID"/>
239 <xsl:attribute name="version" select="$KorAP-XML_version"/>
240
241 <xsl:element name="spanList" namespace="{$KorAP_namespace}">
242 <xsl:apply-templates select="$ann_segmentation.xml//tei:text" mode="struct"/>
243 </xsl:element>
244 </xsl:element>
245 </xsl:result-document>
246 </xsl:template>
247
248 <xsl:template match="tei:*" mode="struct">
249 <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
250 <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
251 <xsl:param name="index" as="xs:integer" required="no" select="1"/>
252 <!-- I have made a major mess here, but it works... it's so spread out
253 because I wanted to make sure to be able to look up the individual
bansp3e5b20c2022-03-18 20:22:31 +0100254 constituent values, should anything go wrong; optimization will come when it's worked against a larger dataset -->
bansp5f841732022-03-16 06:27:31 +0100255 <xsl:variable name="my_name" select="local-name()" as="xs:string"/>
256 <xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
257 <xsl:variable name="preceding-count" select="count($preceding)"/>
258 <xsl:variable name="outside-preceding-count" as="xs:integer">
259 <xsl:choose>
260 <xsl:when test="self::tei:s or self::tei:p">
261 <xsl:choose>
262 <xsl:when test="$preceding-count">
263 <xsl:sequence select="
264 sum(for $p in $preceding
265 return
266 count($p/descendant::*))"/>
267 </xsl:when>
268 <xsl:otherwise>
269 <xsl:sequence select="0"/>
270 </xsl:otherwise>
271 </xsl:choose>
272 </xsl:when>
273 <xsl:otherwise>
274 <xsl:sequence select="0"/>
275 </xsl:otherwise>
276 </xsl:choose>
277 </xsl:variable>
278 <xsl:variable name="my_index" select="$index + 1 + $preceding-count + $outside-preceding-count"
279 as="xs:integer"/>
banspb5992532022-03-29 15:55:44 +0200280
281
282 <!--<xsl:copy select="//tei:seg[count(@nkjp:rejected) ne 0 and @nkjp:rejected ne 'true']"></xsl:copy>-->
bansp5f841732022-03-16 06:27:31 +0100283
284 <xsl:variable name="start" as="xs:integer">
285 <xsl:choose>
286 <xsl:when test="self::tei:text or self::tei:body">
287 <xsl:sequence select="0"/>
288 </xsl:when>
289 <xsl:when test="self::tei:p">
290 <xsl:variable name="first_corresp"
291 select="descendant::tei:s[1]/descendant::tei:seg[1]/attribute::corresp"
292 as="attribute(corresp)"/>
293 <xsl:variable name="numbers" select="substring-after(substring-before($first_corresp,')'),',')"/>
294 <xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
295 </xsl:when>
296 <xsl:when test="self::tei:s">
297 <xsl:variable name="first_corresp"
298 select="descendant::tei:seg[1]/attribute::corresp"
299 as="attribute(corresp)"/>
300 <xsl:variable name="numbers" select="substring-after(substring-before($first_corresp,')'),',')"/>
301 <xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
302 </xsl:when>
303 <xsl:when test="self::tei:seg">
304 <xsl:variable name="numbers" select="substring-after(substring-before(@corresp,')'),',')"/>
305 <xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
306 </xsl:when>
307 </xsl:choose>
308 </xsl:variable>
309 <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)">
310 </xsl:variable>
bansp3e5b20c2022-03-18 20:22:31 +0100311
bansp5f841732022-03-16 06:27:31 +0100312 <xsl:element name="span" namespace="{$KorAP_namespace}">
313 <xsl:attribute name="id" select="'s' || $my_index"/>
314 <xsl:attribute name="from" select="$start"/>
315 <xsl:attribute name="to" select="$end"/>
316 <xsl:attribute name="l" select="f:compute_nesting(.)"/>
317 <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
bansp3e5b20c2022-03-18 20:22:31 +0100318 <xsl:attribute name="type" select="'struct'"></xsl:attribute> <!-- STRUCT vs. LEX -->
bansp5f841732022-03-16 06:27:31 +0100319 <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
bansp3e5b20c2022-03-18 20:22:31 +0100320 <xsl:attribute name="name" select="'name'"/>
321 <xsl:value-of select="local-name()"/>
bansp5f841732022-03-16 06:27:31 +0100322 </xsl:element>
323 <xsl:if test="count(@*)">
324 <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
325 <xsl:attribute name="name" select="'attr'"/>
326 <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
327 <xsl:attribute name="type" select="'attr'"/>
328 <xsl:for-each select="@*">
329 <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
330 <xsl:attribute name="name" select="local-name(.)"/>
331 <xsl:value-of select="."/>
332 </xsl:element>
333 </xsl:for-each>
334 </xsl:element>
335 </xsl:element>
336 </xsl:if>
337 </xsl:element>
338 </xsl:element>
339 <xsl:apply-templates mode="struct">
340 <xsl:with-param name="ini" select="$start" as="xs:integer"/>
341 <xsl:with-param name="fin" select="$end" as="xs:integer"/>
342 <xsl:with-param name="index" select="$my_index"/>
343 </xsl:apply-templates>
344 </xsl:template>
345
346 <!-- ************************** morpho ******************* -->
347
348 <xsl:template name="create_morpho">
349 <xsl:param name="text.xml" as="document-node()"/>
banspe726b4a2022-03-28 05:47:45 +0200350 <xsl:param name="compoundID" as="xs:string"/>
bansp3e5b20c2022-03-18 20:22:31 +0100351 <xsl:param name="ann_segmentation.xml" as="document-node()"/>
bansp5f841732022-03-16 06:27:31 +0100352 <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
353 <xsl:param name="target" as="xs:string"/>
354
355 <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
356 xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">
357 <xsl:processing-instruction name="xml-model">href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
bansp3e5b20c2022-03-18 20:22:31 +0100358 <xsl:element name="layer" namespace="{$KorAP_namespace}">
359 <xsl:attribute name="docid" select="$compoundID"/>
360 <xsl:attribute name="version" select="$KorAP-XML_version"/>
361
362 <xsl:element name="spanList" namespace="{$KorAP_namespace}">
363 <xsl:apply-templates select="$ann_segmentation.xml//tei:text" mode="morpho">
364 <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml" as="document-node()"/>
365 </xsl:apply-templates>
366 </xsl:element>
367 </xsl:element>
bansp5f841732022-03-16 06:27:31 +0100368 </xsl:result-document>
369 </xsl:template>
370
bansp3e5b20c2022-03-18 20:22:31 +0100371 <xsl:template match="tei:*" mode="morpho">
372 <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
373 <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
374 <xsl:param name="index" as="xs:integer" required="no" select="1"/>
375 <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
376 <xsl:variable name="my_name" select="local-name()" as="xs:string"/>
377 <xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
378 <xsl:variable name="preceding-count" select="count($preceding)"/>
379 <xsl:variable name="outside-preceding-count" as="xs:integer">
380 <xsl:choose>
381 <xsl:when test="self::tei:s or self::tei:p">
382 <xsl:choose>
383 <xsl:when test="$preceding-count">
384 <xsl:sequence select="
385 sum(for $p in $preceding
386 return
387 count($p/descendant::*))"/>
388 </xsl:when>
389 <xsl:otherwise>
390 <xsl:sequence select="0"/>
391 </xsl:otherwise>
392 </xsl:choose>
393 </xsl:when>
394 <xsl:otherwise>
395 <xsl:sequence select="0"/>
396 </xsl:otherwise>
397 </xsl:choose>
398 </xsl:variable>
399 <xsl:variable name="my_index" select="$index + 1 + $preceding-count + $outside-preceding-count"
400 as="xs:integer"/>
401
402 <xsl:variable name="start" as="xs:integer">
403 <xsl:choose>
404 <xsl:when test="self::tei:text or self::tei:body">
405 <xsl:sequence select="0"/>
406 </xsl:when>
407 <xsl:when test="self::tei:p">
408 <xsl:variable name="first_corresp"
409 select="descendant::tei:s[1]/descendant::tei:seg[1]/attribute::corresp"
410 as="attribute(corresp)"/>
411 <xsl:variable name="numbers" select="substring-after(substring-before($first_corresp,')'),',')"/>
412 <xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
413 </xsl:when>
414 <xsl:when test="self::tei:s">
415 <xsl:variable name="first_corresp"
416 select="descendant::tei:seg[1]/attribute::corresp"
417 as="attribute(corresp)"/>
418 <xsl:variable name="numbers" select="substring-after(substring-before($first_corresp,')'),',')"/>
419 <xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
420 </xsl:when>
421 <!--<xsl:when test="self::tei:seg">
422 <xsl:variable name="numbers" select="substring-after(substring-before(@corresp,')'),',')"/>
423 <xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
424 </xsl:when>-->
425 </xsl:choose>
426 </xsl:variable>
427 <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)">
428 </xsl:variable>
429
430 <xsl:apply-templates mode="morpho">
431 <xsl:with-param name="ini" select="$start" as="xs:integer"/>
432 <xsl:with-param name="fin" select="$end" as="xs:integer"/>
433 <xsl:with-param name="index" select="$my_index"/>
434 <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml" as="document-node()"/>
435 </xsl:apply-templates>
436 </xsl:template>
bansp5f841732022-03-16 06:27:31 +0100437
bansp3e5b20c2022-03-18 20:22:31 +0100438 <xsl:template match="tei:seg" mode="morpho">
439 <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
440 <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
441 <xsl:param name="index" as="xs:integer" required="no" select="1"/>
442 <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
443 <!-- I have made a major mess here, but it works... it's so spread out
444 because I wanted to make sure to be able to look up the individual
445 constituent values, should anything go wrong -->
446 <xsl:variable name="my_name" select="local-name()" as="xs:string"/>
447 <xsl:variable name="my_id" select="@xml:id" as="xs:string"/>
448 <xsl:variable name="my_morph-seg" as="node()" select="$ann_morphosyntax.xml//tei:seg[substring-after(@corresp,'#') eq $my_id]"/>
449 <xsl:variable name="my_disamb" select="$my_morph-seg//tei:fs/tei:f[@name eq 'disamb']" as="node()"/>
450 <xsl:variable name="my_choice-id" select="substring-after($my_disamb//tei:f[@name eq 'choice']/@fVal,'#')" as="xs:string"/>
451 <xsl:variable name="my_choice-lex" select="$my_morph-seg//tei:f[@name eq 'interps']/tei:fs[@type eq 'lex'][descendant::tei:symbol[@xml:id eq $my_choice-id]]" as="node()"/>
452 <xsl:variable name="chosen-msd" as="xs:string" select="$my_choice-lex/descendant::tei:symbol[@xml:id eq $my_choice-id]/@value"/>
453 <xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
454 <xsl:variable name="preceding-count" select="count($preceding)"/>
banspe726b4a2022-03-28 05:47:45 +0200455 <!--<xsl:variable name="outside-preceding-count" as="xs:integer">
bansp3e5b20c2022-03-18 20:22:31 +0100456 <xsl:choose>
banspe726b4a2022-03-28 05:47:45 +0200457 <xsl:when test="self::tei:s or self::tei:p"> <!-\- THIS NEEDS TO BE REVISITED AFTER THIS TEMPLATE HAS BECOME MORE SPECIFIC -\->
bansp3e5b20c2022-03-18 20:22:31 +0100458 <xsl:choose>
banspe726b4a2022-03-28 05:47:45 +0200459 <xsl:when test="$preceding-count"> commented out for now
bansp3e5b20c2022-03-18 20:22:31 +0100460 <xsl:sequence select="
461 sum(for $p in $preceding
462 return
463 count($p/descendant::*))"/>
464 </xsl:when>
465 <xsl:otherwise>
466 <xsl:sequence select="0"/>
467 </xsl:otherwise>
468 </xsl:choose>
469 </xsl:when>
470 <xsl:otherwise>
471 <xsl:sequence select="0"/>
472 </xsl:otherwise>
473 </xsl:choose>
banspe726b4a2022-03-28 05:47:45 +0200474 </xsl:variable>-->
475 <xsl:variable name="my_index" select="$index + 1 + $preceding-count" as="xs:integer"/>
bansp3e5b20c2022-03-18 20:22:31 +0100476
477 <xsl:variable name="start" as="xs:integer">
478 <xsl:variable name="numbers" select="substring-after(substring-before(@corresp,')'),',')"/>
479 <xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
480 </xsl:variable>
481 <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)">
482 </xsl:variable>
483 <xsl:element name="span" namespace="{$KorAP_namespace}">
484 <xsl:attribute name="id" select="'s' || $my_index"/>
485 <xsl:attribute name="from" select="$start"/>
486 <xsl:attribute name="to" select="$end"/>
487 <xsl:attribute name="l" select="f:compute_nesting(.)"/>
488 <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
489 <xsl:attribute name="type" select="'lex'"/>
490 <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
491 <xsl:attribute name="name" select="'lex'"/>
492 <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
493 <xsl:comment select="$my_morph-seg//tei:fs/tei:f[@name eq 'orth']/tei:string"/>
494
495
496 <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
497 <xsl:attribute name="name" select="'lemma'"/>
498 <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'base']/tei:string"/>
499 </xsl:element>
500 <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
501 <xsl:attribute name="name" select="'pos'"/>
502 <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'ctag']/tei:symbol/@value"/>
503 </xsl:element>
504 <xsl:if test="string-length($chosen-msd)">
505 <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
506 <xsl:attribute name="name" select="'msd'"/>
507 <xsl:value-of select="$chosen-msd"/>
508 </xsl:element>
509 </xsl:if>
510 <xsl:if test="$my_morph-seg//tei:fs/tei:f[@name eq 'nps']">
511 <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
512 <xsl:attribute name="name" select="'join'"/>
513 <xsl:value-of select="'left'"/>
514 </xsl:element>
515 </xsl:if>
516 </xsl:element>
517 </xsl:element>
518 </xsl:element>
519 </xsl:element>
banspe726b4a2022-03-28 05:47:45 +0200520 <xsl:apply-templates mode="morpho">
bansp3e5b20c2022-03-18 20:22:31 +0100521 <xsl:with-param name="ini" select="$start" as="xs:integer"/>
522 <xsl:with-param name="fin" select="$end" as="xs:integer"/>
523 <xsl:with-param name="index" select="$my_index"/>
524 <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml" as="document-node()"/>
banspe726b4a2022-03-28 05:47:45 +0200525 </xsl:apply-templates>-->
bansp3e5b20c2022-03-18 20:22:31 +0100526 </xsl:template>
banspe726b4a2022-03-28 05:47:45 +0200527
bansp5f841732022-03-16 06:27:31 +0100528 <!-- ************************** TEXT header ******************* -->
529
530 <xsl:template name="create_text_header">
531 <xsl:param name="text.xml" as="document-node()"/>
banspe726b4a2022-03-28 05:47:45 +0200532 <xsl:param name="compoundID" as="xs:string"/>
bansp5f841732022-03-16 06:27:31 +0100533 <xsl:param name="target" as="xs:string"/>
534
535 <!-- create the local header.xml file -->
536 <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
537 xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">
538
539 <idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
banspe726b4a2022-03-28 05:47:45 +0200540 <xsl:apply-templates select="$text.xml//tei:TEI/tei:teiHeader/tei:*" mode="text">
541 <xsl:with-param name="compoundID" as="xs:string" select="$compoundID" tunnel="yes"/>
542 </xsl:apply-templates>
bansp5f841732022-03-16 06:27:31 +0100543 </idsHeader>
544 </xsl:result-document>
545 </xsl:template>
546
547 <xsl:template match="tei:fileDesc" mode="text">
bansp9103aab2022-03-19 05:10:21 +0100548 <xsl:element name="{local-name()}">
bansp5f841732022-03-16 06:27:31 +0100549 <xsl:apply-templates mode="text"/>
bansp9103aab2022-03-19 05:10:21 +0100550 </xsl:element>
bansp5f841732022-03-16 06:27:31 +0100551 </xsl:template>
552
553 <xsl:template match="tei:title" mode="text">
554 <t.title>
555 <xsl:apply-templates/>
556 </t.title>
557 </xsl:template>
558
559 <xsl:template match="tei:titleStmt" mode="text">
banspe726b4a2022-03-28 05:47:45 +0200560 <xsl:param name="compoundID" as="xs:string" tunnel="yes"/>
bansp5f841732022-03-16 06:27:31 +0100561 <titleStmt>
562 <textSigle>
banspe726b4a2022-03-28 05:47:45 +0200563 <xsl:value-of select="$compoundID"/>
bansp5f841732022-03-16 06:27:31 +0100564 </textSigle>
565 <xsl:apply-templates mode="text"/>
566 </titleStmt>
567 </xsl:template>
568
bansp9103aab2022-03-19 05:10:21 +0100569 <xsl:template match="tei:publicationStmt" mode="text">
570 <xsl:element name="{local-name()}">
571 <xsl:apply-templates mode="text"/>
572 </xsl:element>
573 </xsl:template>
574
575 <xsl:template match="tei:availability" mode="text">
576 <xsl:element name="{local-name()}">
577 <xsl:apply-templates mode="text" select="@* | *"/>
578 </xsl:element>
579 </xsl:template>
580
581 <xsl:template match="tei:profileDesc" mode="text">
582 <xsl:element name="{local-name()}">
583 <xsl:apply-templates mode="text"/>
584 </xsl:element>
585 </xsl:template>
bansp5f841732022-03-16 06:27:31 +0100586
bansp9103aab2022-03-19 05:10:21 +0100587 <xsl:template match="tei:textClass" mode="text">
588 <xsl:element name="{local-name()}">
589 <xsl:apply-templates mode="text" select="@* | *"/>
590 </xsl:element>
591 </xsl:template>
592
593 <xsl:template match="tei:catRef" mode="text corpus">
594 <xsl:element name="{local-name()}">
595 <xsl:apply-templates mode="text" select="@* | *"/>
596 </xsl:element>
597 </xsl:template>
598
<!-- Text- and corpus-level header: the attributes listed in the match
     pattern are carried over unchanged. @xml:id is only matched when it has
     a tei:classDecl ancestor. -->
<xsl:template mode="text corpus"
              match="@status | @scheme | @target | @type | @xml:id[ancestor::tei:classDecl] | @xml:lang">
  <xsl:copy/>
</xsl:template>
602
<!-- Text- and corpus-level header: re-create a paragraph outside the TEI
     namespace; its content is handed over to the "header-text" mode. -->
<xsl:template match="tei:p" mode="text corpus">
  <p>
    <xsl:apply-templates select="node()" mode="header-text"/>
  </p>
</xsl:template>
608
609
610 <!-- OPTIMIZATION has to take modes into account -->
bansp5e2d1c02022-03-10 04:51:40 +0100611 <!-- ************************** CORPUS header ******************* -->
<!-- Named template: writes the corpus-level header as a separate result
     document.
       $text.xml  source document; the element children (in the TEI namespace)
                  of its tei:teiCorpus/tei:teiHeader are converted in "corpus" mode
       $target    href of the result document to be written -->
<xsl:template name="create_corpus_header">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <!-- the TEI-namespaced element children of the corpus header -->
  <xsl:variable name="corpusHeaderParts"
                select="$text.xml/tei:teiCorpus/tei:teiHeader/tei:*"/>

  <!-- create the corpus-level header.xml file -->
  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes">

    <!-- doctype-public="{$publicDoctypeI5}" and
         doctype-system="{$systemDoctypeI5}" are deliberately not set:
         these are, sadly, useless -->

    <idsHeader type="corpus" pattern="text" status="new" version="1.1" TEIform="teiHeader">
      <xsl:apply-templates select="$corpusHeaderParts" mode="corpus"/>
    </idsHeader>
  </xsl:result-document>
</xsl:template>
629
<!-- Corpus-level header: re-create fileDesc outside the TEI namespace and
     convert all of its child nodes in "corpus" mode. -->
<xsl:template match="tei:fileDesc" mode="corpus">
  <fileDesc>
    <xsl:apply-templates select="node()" mode="corpus"/>
  </fileDesc>
</xsl:template>
bansp9103aab2022-03-19 05:10:21 +0100635
bansp5e2d1c02022-03-10 04:51:40 +0100636
<!-- Corpus-level header: wrap a TEI title in <c.title>. Attributes are
     processed in "corpus" mode first, then the content is handed over to the
     "header-text" mode. -->
<xsl:template match="tei:title" mode="corpus">
  <c.title>
    <xsl:apply-templates select="@*" mode="corpus"/>
    <xsl:apply-templates select="node()" mode="header-text"/>
  </c.title>
</xsl:template>
643
<!-- Corpus-level header: emit <titleStmt>, starting with a <korpusSigle>
     that holds the stylesheet-level $corpusID, followed by the converted
     children (processed in "corpus" mode). -->
<xsl:template match="tei:titleStmt" mode="corpus">
  <titleStmt>
    <!-- text value template; enabled by expand-text="yes" on xsl:stylesheet -->
    <korpusSigle>{$corpusID}</korpusSigle>
    <xsl:apply-templates select="node()" mode="corpus"/>
  </titleStmt>
</xsl:template>
652
<!-- Corpus-level header: re-create publicationStmt outside the TEI namespace
     and convert all of its child nodes in "corpus" mode. -->
<xsl:template match="tei:publicationStmt" mode="corpus">
  <publicationStmt>
    <xsl:apply-templates select="node()" mode="corpus"/>
  </publicationStmt>
</xsl:template>
658
<!-- Corpus-level header: re-create availability outside the TEI namespace.
     Only attributes and child elements are processed (in "corpus" mode);
     direct text-node children are not selected. -->
<xsl:template match="tei:availability" mode="corpus">
  <availability>
    <xsl:apply-templates select="@*, *" mode="corpus"/>
  </availability>
</xsl:template>
664
<!-- Corpus-level header: re-create encodingDesc outside the TEI namespace
     and convert all of its child nodes in "corpus" mode. -->
<xsl:template match="tei:encodingDesc" mode="corpus">
  <encodingDesc>
    <xsl:apply-templates select="node()" mode="corpus"/>
  </encodingDesc>
</xsl:template>
670
<!-- Corpus-level header: structural elements of the class declaration
     (classDecl, taxonomy, category, and bibl directly inside taxonomy) are
     re-created under their local name, outside any namespace. Only their
     attributes and child elements are processed (in "corpus" mode). -->
<xsl:template mode="corpus"
              match="tei:classDecl | tei:taxonomy | tei:category | tei:taxonomy/tei:bibl">
  <xsl:element name="{local-name(.)}" namespace="">
    <xsl:apply-templates select="@*, *" mode="corpus"/>
  </xsl:element>
</xsl:template>
676
<!-- Corpus-level header: text-bearing elements (title inside bibl, edition,
     desc) are re-created under their local name, outside any namespace.
     Attributes are processed in "corpus" mode; the content is handed over to
     the "header-text" mode. -->
<xsl:template mode="corpus"
              match="tei:bibl/tei:title | tei:edition | tei:desc">
  <xsl:element name="{local-name(.)}" namespace="">
    <xsl:apply-templates select="@*" mode="corpus"/>
    <xsl:apply-templates select="node()" mode="header-text"/>
  </xsl:element>
</xsl:template>
683<!--
684 <xsl:template match="tei:textClass" mode="corpus">
685 <xsl:element name="{local-name()}">
686 <xsl:apply-templates mode="corpus" select="@* | *"/>
687 </xsl:element>
688 </xsl:template>
689
690 <xsl:template match="tei:catRef" mode="corpus">
691 <xsl:element name="{local-name()}">
692 <xsl:apply-templates mode="corpus" select="@* | *"/>
693 </xsl:element>
694 </xsl:template>
695-->
bansp5e2d1c02022-03-10 04:51:40 +0100696
697
698
<!-- This template can be called by the XSpec test; TODO: find a way to call the main() template directly. -->
<!-- I have not fully handled the parameter transmission, which would have to be kludged in just for the sake of XSpec.
     I am disabling this template for now, due to XSpec design issues; relevant links, among others:

     https://stackoverflow.com/questions/64933277/what-is-the-cause-of-error-cannot-execute-xslresult-document-while-evaluating
     https://www.balisage.net/Proceedings/vol25/html/Galtman01/BalisageVol25-Galtman01.html

     In short: the internal design of XSpec forces kludges when one wants to use xsl:result-document in one's stylesheets.
     I don't want to be strangled by kludges at the beginning of the work, and I have already lost quite a bit of time on
     this investigation. I will therefore "just code", and we can think of externalizing bits of templates later if we
     want to play with tests. For now, I don't want to have to handle context items in a special way inside variables,
     etc., because I'm not sure it's worth it.

-->
712 <!--<xsl:template name="test_full">
713 <xsl:param name="corpusID"/>
714 <xsl:param name="docID"/>
715 <xsl:param name="textID"/>
716 <xsl:call-template name="xsl:initial-template"/>
717 </xsl:template>-->
718
Akron9a8ee3e2022-01-31 13:51:49 +0100719</xsl:stylesheet>