blob: b4909ba197fada702887856cf65210b7a1fb5da0 [file] [log] [blame]
Akron9a8ee3e2022-01-31 13:51:49 +01001<?xml version="1.0" encoding="UTF-8"?>
2<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
bansp5e2d1c02022-03-10 04:51:40 +01003 xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"
4 xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:f="func"
5 xmlns:map="http://www.w3.org/2005/xpath-functions/map" exclude-result-prefixes="xs f map nkjp tei"
6 version="3.0" expand-text="yes">
Akron9a8ee3e2022-01-31 13:51:49 +01007
banspe726b4a2022-03-28 05:47:45 +02008
9<!-- PARAMETERS -->
bansp5e2d1c02022-03-10 04:51:40 +010010
<!-- Directory containing the NKJP files, in the form of a collection of
     text-level directories. -->
<xsl:param as="xs:string" name="sourceDir" select="'test/resources/nkjp2korap_sample2'"/>

<!-- Directory below which all result documents are written. -->
<xsl:param as="xs:string" name="targetDir" select="'test/output'"/>

<!-- Comma-separated list of document IDs to be skipped from processing.
     Example: HellerPodgladanie,KOT
     No functionality beyond string identity is supported. -->
<xsl:param as="xs:string" name="skip_docID"
  select="'HellerPodgladanie,IsakowiczZaleskiMoje,KolakowskiOco,MysliwskiKamien,WilkWilczy,ZycieWarszawy_Zycie'"/>
banspb5992532022-03-29 15:55:44 +020022
bansp8f6700b2022-03-27 05:27:09 +020023
banspe726b4a2022-03-28 05:47:45 +020024<!-- VARIABLES -->
25
<!-- Corpus-level and document-level IDs; both are the constant 'NKJP'. -->
<xsl:variable name="corpusID" static="yes" as="xs:string" select="'NKJP'"/>
<xsl:variable name="docID" static="yes" as="xs:string" select="'NKJP'"/>

<!-- Output directory of the corpus, with a trailing slash. -->
<xsl:variable name="targetCorpusDir_slashed" as="xs:string"
  select="concat($targetDir, '/', $corpusID, '/')"/>

<!-- System and public identifiers of the I5 DTD. -->
<xsl:variable name="systemDoctypeI5" static="yes" as="xs:string"
  select="'http://corpora.ids-mannheim.de/I5/DTD/i5.dtd'"/>

<xsl:variable name="publicDoctypeI5" static="yes" as="xs:string"
  select="'-//IDS//DTD I5 1.0//EN'"/>

<!-- Namespace of the KorAP-XML result elements. -->
<xsl:variable name="KorAP_namespace" static="yes" as="xs:string"
  select="'http://ids-mannheim.de/ns/KorAP'"/>

<!-- Value written to the @version attribute of each layer.
     (Original author's remark: "this is only a bit funny".) -->
<xsl:variable name="KorAP-XML_version" static="yes" as="xs:string" select="'KorAP-0.4'"/>

<!-- Query parameters appended to the collection URI; see
     https://www.saxonica.com/documentation11/index.html#!sourcedocs/collections/collection-directories -->
<xsl:variable name="collection_params" static="yes" as="xs:string"
  select="'recurse=yes;validation=strip;select=text.xml;content-type=application/xml;on-error=warning;xinclude=yes'"/>

<!-- All text.xml documents found below $sourceDir; at least one is required. -->
<xsl:variable name="collection_of_text" as="document-node()+"
  select="collection(concat($sourceDir, '?', $collection_params))"/>
banspb5992532022-03-29 15:55:44 +020050
51
banspe726b4a2022-03-28 05:47:45 +020052<!-- MODES -->
bansp5e2d1c02022-03-10 04:51:40 +010053
<!-- "corpus" and "text": a node without a matching template is dropped
     together with its whole subtree. -->
<xsl:mode on-no-match="deep-skip" name="corpus"/>
<xsl:mode on-no-match="deep-skip" name="text"/>
<!-- "header-text": a node without a matching template contributes only its text. -->
<xsl:mode on-no-match="text-only-copy" name="header-text"/>
bansp5e2d1c02022-03-10 04:51:40 +010057
banspe726b4a2022-03-28 05:47:45 +020058
59 <!-- FUNCTIONS -->
60
<!-- Returns the number of elements on the ancestor-or-self axis of $node,
     not counting elements whose local name is 'TEI' or 'teiCorpus'. -->
<xsl:function name="f:compute_nesting" as="xs:integer">
  <xsl:param name="node" as="node()"/>
  <xsl:sequence
    select="count($node/ancestor-or-self::*[not(local-name() = ('TEI', 'teiCorpus'))])"/>
</xsl:function>
68
<!-- Returns start + length of the string range named by a @corresp attribute,
     i.e. the second and third comma-separated values before the closing
     parenthesis of e.g. '...(some-id,6,4)'.

     Which @corresp is read depends on $node:
       tei:text, tei:body : last tei:seg of the last tei:s of the last tei:p
       tei:p              : last tei:seg of the last tei:s
       tei:s              : last tei:seg
       anything else      : the node's own @corresp

     The transformation fails if that attribute is absent or if the two
     values are not integers. -->
<xsl:function name="f:calc_content_length" as="xs:integer">
  <xsl:param name="node" as="node()"/>

  <!-- the one attribute whose range is evaluated -->
  <xsl:variable name="anchor" as="attribute(corresp)"
    select="
      if ($node/self::tei:text or $node/self::tei:body) then
        $node/descendant::tei:p[last()]/descendant::tei:s[last()]/descendant::tei:seg[last()]/attribute::corresp
      else if ($node/self::tei:p) then
        $node/descendant::tei:s[last()]/descendant::tei:seg[last()]/attribute::corresp
      else if ($node/self::tei:s) then
        $node/descendant::tei:seg[last()]/attribute::corresp
      else
        $node/attribute::corresp"/>

  <!-- 'start,length': the text between the first comma and the first ')' -->
  <xsl:variable name="numbers" as="xs:string"
    select="substring-after(substring-before($anchor, ')'), ',')"/>

  <xsl:sequence
    select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"/>
</xsl:function>
bansp5e2d1c02022-03-10 04:51:40 +010099
banspb5992532022-03-29 15:55:44 +0200100
101<!-- UTILITY TEMPLATES -->
102
<!-- Nodes that produce no output in any mode:

     @default   : deletes some auto-inserted attribute throughout.

     tei:w      : NKJP-SGJP has apparently resigned from standoff representations
                  by adding <w> everywhere; for the time being we just stick to
                  the standoff offsets, although that may need to be revisited,
                  as the NKJP format has now begun to stray from its schemas and
                  assumptions.

     tei:choice : THIS IS ONLY TEMPORARY. An interesting challenge came up where
                  straightforward mapping will probably have to be abandoned
                  because of TOKENIZATION alternatives; for now the stylesheet
                  just has to work, even if it eats an occasional token (which it
                  now does: 'komuś' and 'czym' vanish). -->
<xsl:template mode="#all" match="@default | tei:w | tei:choice"/>
bansp8f6700b2022-03-27 05:27:09 +0200118
banspb5992532022-03-29 15:55:44 +0200119
120 <!-- MAIN PROCESSING -->
121
122
<!-- Entry point: writes the corpus header once, then converts every text.xml
     of the input collection whose directory name is not on the skip list. -->
<xsl:template name="xsl:initial-template">
  <!-- Surrounding whitespace is trimmed from each entry, so that a list written
       as 'A, B' skips B as well. -->
  <xsl:variable name="IDs_to_skip" as="xs:string*"
    select="tokenize($skip_docID, ',') ! normalize-space(.)"/>

  <!-- We only want to call the template below once, and the first document of
       the collection is used for that purpose: all that is needed is the main
       corpus header, which can (should) be reachable from any NKJP corpus
       document. -->
  <xsl:call-template name="create_corpus_header">
    <xsl:with-param name="text.xml" select="$collection_of_text[1]" as="document-node()"/>
    <xsl:with-param name="target" select="$targetCorpusDir_slashed || 'header.xml'" as="xs:string"/>
  </xsl:call-template>

  <xsl:for-each select="$collection_of_text">
    <!-- The pattern is anchored so that only the trailing file name is removed. -->
    <xsl:variable name="my_dir" as="xs:string" select="replace(base-uri(), '/text\.xml$', '')"/>
    <!-- The name of the containing directory serves as the text ID. -->
    <xsl:variable name="my_textID" as="xs:string" select="tokenize($my_dir, '/')[last()]"/>
    <xsl:variable name="ann_morphosyntax.uri" as="xs:string"
      select="$my_dir || '/ann_morphosyntax.xml'"/>
    <xsl:variable name="ann_segmentation.uri" as="xs:string"
      select="$my_dir || '/ann_segmentation.xml'"/>

    <!-- The annotation documents are loaded only for texts that are processed. -->
    <xsl:if test="not($my_textID = $IDs_to_skip)">
      <xsl:call-template name="process_single_sample">
        <xsl:with-param name="text.xml" as="document-node()" select="."/>
        <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
          select="doc($ann_morphosyntax.uri)"/>
        <xsl:with-param name="ann_segmentation.xml" as="document-node()"
          select="doc($ann_segmentation.uri)"/>
        <xsl:with-param name="my_textID" select="$my_textID" as="xs:string"/>
      </xsl:call-template>
    </xsl:if>
  </xsl:for-each>
</xsl:template>
154
<!-- Converts one NKJP text: calls, in this order, the templates that write
     data.xml, struct/structure.xml, nkjp/morpho.xml and header.xml below
     {corpus dir}/{docID}/{text ID}.

     text.xml, ann_morphosyntax.xml, ann_segmentation.xml : the parsed input documents
     my_textID : ID of the text; defaults to the marker value '0BAD_textID' -->
<xsl:template name="process_single_sample">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="my_textID" as="xs:string" select="'0BAD_textID'"/>

  <!-- directory receiving all files of this text -->
  <xsl:variable name="out_dir" as="xs:string"
    select="concat($targetCorpusDir_slashed, $docID, '/', $my_textID)"/>

  <!-- this is what occurs in the text and data layers as @docid -->
  <xsl:variable name="sigle" as="xs:string"
    select="concat($corpusID, '_', $docID, '.', $my_textID)"/>

  <xsl:call-template name="create_data">
    <xsl:with-param name="text.xml" as="document-node()" select="$text.xml"/>
    <xsl:with-param name="compoundID" as="xs:string" select="$sigle"/>
    <xsl:with-param name="target" as="xs:string" select="concat($out_dir, '/data.xml')"/>
  </xsl:call-template>

  <xsl:call-template name="create_struct">
    <xsl:with-param name="text.xml" as="document-node()" select="$text.xml"/>
    <xsl:with-param name="compoundID" as="xs:string" select="$sigle"/>
    <xsl:with-param name="ann_segmentation.xml" as="document-node()"
      select="$ann_segmentation.xml"/>
    <xsl:with-param name="target" as="xs:string"
      select="concat($out_dir, '/struct/structure.xml')"/>
  </xsl:call-template>

  <xsl:call-template name="create_morpho">
    <xsl:with-param name="text.xml" as="document-node()" select="$text.xml"/>
    <xsl:with-param name="compoundID" as="xs:string" select="$sigle"/>
    <xsl:with-param name="ann_segmentation.xml" as="document-node()"
      select="$ann_segmentation.xml"/>
    <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
      select="$ann_morphosyntax.xml"/>
    <xsl:with-param name="target" as="xs:string" select="concat($out_dir, '/nkjp/morpho.xml')"/>
  </xsl:call-template>

  <xsl:call-template name="create_text_header">
    <xsl:with-param name="text.xml" as="document-node()" select="$text.xml"/>
    <xsl:with-param name="compoundID" as="xs:string" select="$sigle"/>
    <xsl:with-param name="target" as="xs:string" select="concat($out_dir, '/header.xml')"/>
  </xsl:call-template>
</xsl:template>
200
201 <!-- ************************** data.xml ******************* -->
202
<!-- Writes the data.xml file of one text to $target.

     text.xml   : the source document; the string values of all its elements
                  with local name 'ab' become the raw text (several values are
                  joined with the default single-space separator of xsl:value-of)
     compoundID : written to raw_text/@docid
     target     : URI of the result document -->
<xsl:template name="create_data">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="target" as="xs:string"/>

  <!-- No xpath-default-namespace here: that attribute is not an attribute value
       template, so a value written with curly braces would be taken literally
       as the namespace URI; and no XPath expression in this block uses an
       unprefixed element name anyway. -->
  <xsl:result-document encoding="UTF-8" method="xml" indent="yes" href="{$target}">

    <xsl:processing-instruction name="xml-model">href=&quot;text.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
    <xsl:element name="raw_text" namespace="{$KorAP_namespace}">
      <xsl:attribute name="docid" select="$compoundID"/>
      <xsl:element name="metadata" namespace="{$KorAP_namespace}">
        <xsl:attribute name="file" select="'metadata.xml'"/>
      </xsl:element>

      <xsl:element name="text" namespace="{$KorAP_namespace}">
        <xsl:value-of select="$text.xml//*[local-name() = 'ab']"/>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
224
bansp5f841732022-03-16 06:27:31 +0100225 <!-- ************************** struct ******************* -->
226
<!-- Writes the structure layer of one text to $target: a KorAP layer element
     whose spanList holds the output of mode "struct", applied to every tei:text
     of the segmentation document.

     text.xml             : accepted, but not read in this template
     compoundID           : written to layer/@docid
     ann_segmentation.xml : the segmentation annotation document
     target               : URI of the result document -->
<xsl:template name="create_struct">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <xsl:processing-instruction name="xml-model">href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"</xsl:processing-instruction>
    <xsl:element namespace="{$KorAP_namespace}" name="layer">
      <xsl:attribute name="docid">{$compoundID}</xsl:attribute>
      <xsl:attribute name="version">{$KorAP-XML_version}</xsl:attribute>

      <xsl:element namespace="{$KorAP_namespace}" name="spanList">
        <xsl:apply-templates mode="struct" select="$ann_segmentation.xml//tei:text"/>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
246
<!-- Mode "struct": emits one KorAP span for the current TEI element and then
     processes its children in the same mode.

     ini, fin : received from the parent (its start and end), not read here
     index    : the parent's span index; the span ID is derived from it

     Only tei:text, tei:body, tei:p, tei:s and tei:seg yield a start offset;
     any other element makes the typed $start variable fail. -->
<xsl:template match="tei:*" mode="struct">
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>

  <!-- preceding siblings bearing the same local name as the current element -->
  <xsl:variable name="own_name" as="xs:string" select="local-name()"/>
  <xsl:variable name="earlier" select="preceding-sibling::*[local-name(.) eq $own_name]"/>

  <!-- for tei:s and tei:p, every element inside those earlier siblings is
       counted as well; sibling subtrees are disjoint, so one count over all of
       them equals the sum of the per-sibling counts -->
  <xsl:variable name="inside_earlier" as="xs:integer"
    select="
      if (self::tei:s or self::tei:p) then
        count($earlier/descendant::*)
      else
        0"/>

  <xsl:variable name="my_index" as="xs:integer"
    select="$index + 1 + count($earlier) + $inside_earlier"/>

  <!--<xsl:copy select="//tei:seg[count(@nkjp:rejected) ne 0 and @nkjp:rejected ne 'true']"></xsl:copy>-->

  <!-- start offset: 0 for tei:text and tei:body, otherwise the first number of
       the relevant @corresp range -->
  <xsl:variable name="start" as="xs:integer">
    <xsl:choose>
      <xsl:when test="self::tei:text or self::tei:body">
        <xsl:sequence select="0"/>
      </xsl:when>
      <xsl:when test="self::tei:p">
        <xsl:variable name="first_corresp" as="attribute(corresp)"
          select="descendant::tei:s[1]/descendant::tei:seg[1]/attribute::corresp"/>
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before($first_corresp, ')'), ','), ','))"/>
      </xsl:when>
      <xsl:when test="self::tei:s">
        <xsl:variable name="first_corresp" as="attribute(corresp)"
          select="descendant::tei:seg[1]/attribute::corresp"/>
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before($first_corresp, ')'), ','), ','))"/>
      </xsl:when>
      <xsl:when test="self::tei:seg">
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before(@corresp, ')'), ','), ','))"/>
      </xsl:when>
    </xsl:choose>
  </xsl:variable>
  <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

  <xsl:element namespace="{$KorAP_namespace}" name="span">
    <xsl:attribute name="id">{'s' || $my_index}</xsl:attribute>
    <xsl:attribute name="from">{$start}</xsl:attribute>
    <xsl:attribute name="to">{$end}</xsl:attribute>
    <xsl:attribute name="l">{f:compute_nesting(.)}</xsl:attribute>
    <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="fs">
      <!-- STRUCT vs. LEX -->
      <xsl:attribute name="type">struct</xsl:attribute>
      <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="f">
        <xsl:attribute name="name">name</xsl:attribute>
        <xsl:value-of select="local-name()"/>
      </xsl:element>
      <!-- the element's own attributes, if any, as a nested feature structure -->
      <xsl:if test="exists(@*)">
        <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="f">
          <xsl:attribute name="name">attr</xsl:attribute>
          <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="fs">
            <xsl:attribute name="type">attr</xsl:attribute>
            <xsl:for-each select="@*">
              <xsl:element namespace="http://www.tei-c.org/ns/1.0" name="f">
                <xsl:attribute name="name">{local-name(.)}</xsl:attribute>
                <xsl:value-of select="."/>
              </xsl:element>
            </xsl:for-each>
          </xsl:element>
        </xsl:element>
      </xsl:if>
    </xsl:element>
  </xsl:element>

  <xsl:apply-templates mode="struct">
    <xsl:with-param name="ini" as="xs:integer" select="$start"/>
    <xsl:with-param name="fin" as="xs:integer" select="$end"/>
    <xsl:with-param name="index" select="$my_index"/>
  </xsl:apply-templates>
</xsl:template>
344
345 <!-- ************************** morpho ******************* -->
346
<!-- Writes the morphosyntactic layer of one text to $target: a KorAP layer
     element whose spanList holds the output of mode "morpho", applied to every
     tei:text of the segmentation document.

     text.xml             : accepted, but not read in this template
     compoundID           : written to layer/@docid
     ann_segmentation.xml : the segmentation document that is walked
     ann_morphosyntax.xml : handed on to the "morpho" templates
     target               : URI of the result document -->
<xsl:template name="create_morpho">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <xsl:processing-instruction name="xml-model">href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"</xsl:processing-instruction>
    <xsl:element namespace="{$KorAP_namespace}" name="layer">
      <xsl:attribute name="docid">{$compoundID}</xsl:attribute>
      <xsl:attribute name="version">{$KorAP-XML_version}</xsl:attribute>

      <xsl:element namespace="{$KorAP_namespace}" name="spanList">
        <xsl:apply-templates mode="morpho" select="$ann_segmentation.xml//tei:text">
          <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
            select="$ann_morphosyntax.xml"/>
        </xsl:apply-templates>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
369
<!-- Mode "morpho", generic TEI element: produces no span itself; it computes
     the index and offsets of the current element and hands them, together with
     the morphosyntax document, to its children in the same mode.

     ini, fin : received from the parent, not read here
     index    : the parent's index
     ann_morphosyntax.xml : passed through unchanged

     Only tei:text, tei:body, tei:p and tei:s yield a start offset; any other
     element reaching this template makes the typed $start variable fail. -->
<xsl:template match="tei:*" mode="morpho">
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>

  <!-- preceding siblings bearing the same local name as the current element -->
  <xsl:variable name="own_name" as="xs:string" select="local-name()"/>
  <xsl:variable name="earlier" select="preceding-sibling::*[local-name(.) eq $own_name]"/>

  <!-- for tei:s and tei:p, every element inside those earlier siblings is
       counted as well; sibling subtrees are disjoint, so one count over all of
       them equals the sum of the per-sibling counts -->
  <xsl:variable name="inside_earlier" as="xs:integer"
    select="
      if (self::tei:s or self::tei:p) then
        count($earlier/descendant::*)
      else
        0"/>

  <xsl:variable name="my_index" as="xs:integer"
    select="$index + 1 + count($earlier) + $inside_earlier"/>

  <xsl:variable name="start" as="xs:integer">
    <xsl:choose>
      <xsl:when test="self::tei:text or self::tei:body">
        <xsl:sequence select="0"/>
      </xsl:when>
      <xsl:when test="self::tei:p">
        <xsl:variable name="first_corresp" as="attribute(corresp)"
          select="descendant::tei:s[1]/descendant::tei:seg[1]/attribute::corresp"/>
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before($first_corresp, ')'), ','), ','))"/>
      </xsl:when>
      <xsl:when test="self::tei:s">
        <xsl:variable name="first_corresp" as="attribute(corresp)"
          select="descendant::tei:seg[1]/attribute::corresp"/>
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before($first_corresp, ')'), ','), ','))"/>
      </xsl:when>
      <!-- tei:seg is handled by its own template in this mode -->
    </xsl:choose>
  </xsl:variable>
  <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

  <xsl:apply-templates mode="morpho">
    <xsl:with-param name="ini" as="xs:integer" select="$start"/>
    <xsl:with-param name="fin" as="xs:integer" select="$end"/>
    <xsl:with-param name="index" select="$my_index"/>
    <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
      select="$ann_morphosyntax.xml"/>
  </xsl:apply-templates>
</xsl:template>
bansp5f841732022-03-16 06:27:31 +0100436
<!-- Index of tei:seg elements by the part of @corresp that follows '#'.
     It is queried below against the morphosyntax document only, and replaces a
     scan of that whole document for every single segment. -->
<xsl:key name="f:morph-seg-by-target-id" match="tei:seg" use="substring-after(@corresp, '#')"/>

<!-- Mode "morpho", tei:seg: emits one KorAP span carrying the lemma, pos and,
     where non-empty, msd of the interpretation selected by the 'disamb'
     feature of the matching morphosyntax segment, plus join='left' when that
     segment has an 'nps' feature. The 'orth' string is added as an XML comment.

     ini, fin : received from the parent, not read here
     index    : the parent's index; the span ID is derived from it
     ann_morphosyntax.xml : document in which the matching segment is looked up

     The typed variables below make the transformation fail unless there is
     exactly one matching morphosyntax segment, one 'disamb' feature and one
     'lex' structure containing the chosen symbol. -->
<xsl:template match="tei:seg" mode="morpho">
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>

  <!-- The steps are kept in separate variables so that the individual
       constituent values can be looked up, should anything go wrong. -->
  <xsl:variable name="my_name" select="local-name()" as="xs:string"/>
  <xsl:variable name="my_id" select="@xml:id" as="xs:string"/>
  <!-- the morphosyntax segment whose @corresp points at this segment's ID -->
  <xsl:variable name="my_morph-seg" as="node()"
    select="key('f:morph-seg-by-target-id', $my_id, $ann_morphosyntax.xml)"/>
  <xsl:variable name="my_disamb" select="$my_morph-seg//tei:fs/tei:f[@name eq 'disamb']" as="node()"/>
  <!-- ID of the symbol that the disambiguation points at -->
  <xsl:variable name="my_choice-id" select="substring-after($my_disamb//tei:f[@name eq 'choice']/@fVal,'#')" as="xs:string"/>
  <!-- the 'lex' interpretation that contains that symbol -->
  <xsl:variable name="my_choice-lex" select="$my_morph-seg//tei:f[@name eq 'interps']/tei:fs[@type eq 'lex'][descendant::tei:symbol[@xml:id eq $my_choice-id]]" as="node()"/>
  <xsl:variable name="chosen-msd" as="xs:string" select="$my_choice-lex/descendant::tei:symbol[@xml:id eq $my_choice-id]/@value"/>
  <xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
  <xsl:variable name="preceding-count" select="count($preceding)"/>
  <!-- Unlike the generic template of this mode, no descendant count of the
       preceding siblings enters the index here; the original author had that
       part commented out, with a note to revisit it. -->
  <xsl:variable name="my_index" select="$index + 1 + $preceding-count" as="xs:integer"/>

  <xsl:variable name="start" as="xs:integer">
    <xsl:variable name="numbers" select="substring-after(substring-before(@corresp,')'),',')"/>
    <xsl:sequence select="xs:integer(substring-before($numbers,','))"/>
  </xsl:variable>
  <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

  <xsl:element name="span" namespace="{$KorAP_namespace}">
    <xsl:attribute name="id" select="'s' || $my_index"/>
    <xsl:attribute name="from" select="$start"/>
    <xsl:attribute name="to" select="$end"/>
    <xsl:attribute name="l" select="f:compute_nesting(.)"/>
    <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
      <xsl:attribute name="type" select="'lex'"/>
      <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
        <xsl:attribute name="name" select="'lex'"/>
        <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
          <xsl:comment select="$my_morph-seg//tei:fs/tei:f[@name eq 'orth']/tei:string"/>

          <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="name" select="'lemma'"/>
            <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'base']/tei:string"/>
          </xsl:element>
          <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="name" select="'pos'"/>
            <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'ctag']/tei:symbol/@value"/>
          </xsl:element>
          <xsl:if test="string-length($chosen-msd)">
            <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
              <xsl:attribute name="name" select="'msd'"/>
              <xsl:value-of select="$chosen-msd"/>
            </xsl:element>
          </xsl:if>
          <xsl:if test="$my_morph-seg//tei:fs/tei:f[@name eq 'nps']">
            <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
              <xsl:attribute name="name" select="'join'"/>
              <xsl:value-of select="'left'"/>
            </xsl:element>
          </xsl:if>
        </xsl:element>
      </xsl:element>
    </xsl:element>
  </xsl:element>

  <!-- Children are still processed, as before. What is gone is the stray
       comment terminator that used to follow this instruction: it was ordinary
       text and was therefore written to the output after every span. -->
  <xsl:apply-templates mode="morpho">
    <xsl:with-param name="ini" select="$start" as="xs:integer"/>
    <xsl:with-param name="fin" select="$end" as="xs:integer"/>
    <xsl:with-param name="index" select="$my_index"/>
    <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml" as="document-node()"/>
  </xsl:apply-templates>
</xsl:template>
banspe726b4a2022-03-28 05:47:45 +0200526
bansp5f841732022-03-16 06:27:31 +0100527 <!-- ************************** TEXT header ******************* -->
528
<!-- Writes the text-level header.xml to $target: an idsHeader element holding
     the output of mode "text", applied to the element children of every
     tei:TEI/tei:teiHeader in $text.xml.

     text.xml   : the source document
     compoundID : passed on as a tunnel parameter
     target     : URI of the result document -->
<xsl:template name="create_text_header">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="target" as="xs:string"/>

  <!-- create the local header.xml file -->
  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">

    <idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
      <xsl:apply-templates mode="text" select="$text.xml//tei:TEI/tei:teiHeader/tei:*">
        <xsl:with-param name="compoundID" tunnel="yes" as="xs:string" select="$compoundID"/>
      </xsl:apply-templates>
    </idsHeader>
  </xsl:result-document>
</xsl:template>
545
546 <xsl:template match="tei:fileDesc" mode="text">
bansp9103aab2022-03-19 05:10:21 +0100547 <xsl:element name="{local-name()}">
bansp5f841732022-03-16 06:27:31 +0100548 <xsl:apply-templates mode="text"/>
bansp9103aab2022-03-19 05:10:21 +0100549 </xsl:element>
bansp5f841732022-03-16 06:27:31 +0100550 </xsl:template>
551
552 <xsl:template match="tei:title" mode="text">
553 <t.title>
554 <xsl:apply-templates/>
555 </t.title>
556 </xsl:template>
557
558 <xsl:template match="tei:titleStmt" mode="text">
banspe726b4a2022-03-28 05:47:45 +0200559 <xsl:param name="compoundID" as="xs:string" tunnel="yes"/>
bansp5f841732022-03-16 06:27:31 +0100560 <titleStmt>
561 <textSigle>
banspe726b4a2022-03-28 05:47:45 +0200562 <xsl:value-of select="$compoundID"/>
bansp5f841732022-03-16 06:27:31 +0100563 </textSigle>
564 <xsl:apply-templates mode="text"/>
565 </titleStmt>
566 </xsl:template>
567
bansp9103aab2022-03-19 05:10:21 +0100568 <xsl:template match="tei:publicationStmt" mode="text">
569 <xsl:element name="{local-name()}">
570 <xsl:apply-templates mode="text"/>
571 </xsl:element>
572 </xsl:template>
573
574 <xsl:template match="tei:availability" mode="text">
575 <xsl:element name="{local-name()}">
576 <xsl:apply-templates mode="text" select="@* | *"/>
577 </xsl:element>
578 </xsl:template>
579
580 <xsl:template match="tei:profileDesc" mode="text">
581 <xsl:element name="{local-name()}">
582 <xsl:apply-templates mode="text"/>
583 </xsl:element>
584 </xsl:template>
bansp5f841732022-03-16 06:27:31 +0100585
bansp9103aab2022-03-19 05:10:21 +0100586 <xsl:template match="tei:textClass" mode="text">
587 <xsl:element name="{local-name()}">
588 <xsl:apply-templates mode="text" select="@* | *"/>
589 </xsl:element>
590 </xsl:template>
591
592 <xsl:template match="tei:catRef" mode="text corpus">
593 <xsl:element name="{local-name()}">
594 <xsl:apply-templates mode="text" select="@* | *"/>
595 </xsl:element>
596 </xsl:template>
597
<!-- Shared by the text and corpus modes: the attributes listed in the match
     pattern are copied to the output unchanged.
     xml:id is matched only when the attribute has a tei:classDecl ancestor. -->
<xsl:template mode="text corpus"
              match="@status
                     | @scheme
                     | @target
                     | @type
                     | @xml:id[ancestor::tei:classDecl]
                     | @xml:lang">
  <xsl:copy/>
</xsl:template>
601
<!-- Shared by the text and corpus modes: re-creates tei:p as an un-namespaced <p>;
     its child nodes are handed over to the templates of the header-text mode. -->
<xsl:template match="tei:p" mode="text corpus">
  <p>
    <xsl:apply-templates select="node()" mode="header-text"/>
  </p>
</xsl:template>
607
608
609 <!-- OPTIMIZATION has to take modes into account -->
bansp5e2d1c02022-03-10 04:51:40 +0100610 <!-- ************************** CORPUS header ******************* -->
<!-- Named template that writes the corpus-level header.xml file.
     Parameters:
       text.xml : document node; the element children (in the TEI namespace) of its
                  tei:teiCorpus/tei:teiHeader are processed in corpus mode
       target   : href of the result document that receives the <idsHeader> -->
<xsl:template name="create_corpus_header">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <!-- the TEI element children of the corpus-level teiHeader -->
  <xsl:variable name="headerParts" select="$text.xml/tei:teiCorpus/tei:teiHeader/tei:*"/>

  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes">

    <!-- doctype-public="{$publicDoctypeI5}" and doctype-system="{$systemDoctypeI5}"
         are left out: these are, sadly, useless -->

    <idsHeader type="corpus" pattern="text" status="new" version="1.1" TEIform="teiHeader">
      <xsl:apply-templates select="$headerParts" mode="corpus"/>
    </idsHeader>
  </xsl:result-document>
</xsl:template>
628
<!-- Corpus-level header: re-creates tei:fileDesc as an un-namespaced <fileDesc>
     and processes all of its child nodes in the current (corpus) mode. -->
<xsl:template match="tei:fileDesc" mode="corpus">
  <fileDesc>
    <xsl:apply-templates select="node()" mode="#current"/>
  </fileDesc>
</xsl:template>
bansp9103aab2022-03-19 05:10:21 +0100634
bansp5e2d1c02022-03-10 04:51:40 +0100635
<!-- Corpus-level header: renders tei:title as <c.title>.
     Attributes go through the current (corpus) mode; the child nodes are
     processed in the header-text mode. -->
<xsl:template match="tei:title" mode="corpus">
  <c.title>
    <xsl:apply-templates select="@*" mode="#current"/>
    <xsl:apply-templates select="node()" mode="header-text"/>
  </c.title>
</xsl:template>
642
<!-- Corpus-level header: emits <titleStmt>, starting with a <korpusSigle> that
     holds the value of the global $corpusID variable, followed by the result of
     processing the children of tei:titleStmt in the current (corpus) mode. -->
<xsl:template match="tei:titleStmt" mode="corpus">
  <titleStmt>
    <!-- text value template; the stylesheet is declared with expand-text="yes" -->
    <korpusSigle>{$corpusID}</korpusSigle>
    <xsl:apply-templates select="node()" mode="#current"/>
  </titleStmt>
</xsl:template>
651
<!-- Corpus-level header: re-creates tei:publicationStmt as an un-namespaced
     <publicationStmt> and processes all of its child nodes in the current (corpus) mode. -->
<xsl:template match="tei:publicationStmt" mode="corpus">
  <publicationStmt>
    <xsl:apply-templates select="node()" mode="#current"/>
  </publicationStmt>
</xsl:template>
657
<!-- Corpus-level header: re-creates tei:availability as <availability>.
     Only attributes and child elements are processed (in the current, corpus mode);
     other child nodes are not selected. -->
<xsl:template match="tei:availability" mode="corpus">
  <availability>
    <xsl:apply-templates select="@*" mode="#current"/>
    <xsl:apply-templates select="*" mode="#current"/>
  </availability>
</xsl:template>
663
<!-- Corpus-level header: re-creates tei:encodingDesc as an un-namespaced
     <encodingDesc> and processes all of its child nodes in the current (corpus) mode. -->
<xsl:template match="tei:encodingDesc" mode="corpus">
  <encodingDesc>
    <xsl:apply-templates select="node()" mode="#current"/>
  </encodingDesc>
</xsl:template>
669
<!-- Corpus-level header: structural containers of the class declaration
     (classDecl, taxonomy, category, and bibl directly inside taxonomy) are
     re-created under their local name, without the TEI namespace.
     Only attributes and child elements are processed, in the current (corpus) mode. -->
<xsl:template mode="corpus"
              match="tei:classDecl
                     | tei:taxonomy
                     | tei:category
                     | tei:taxonomy/tei:bibl">
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="@*" mode="#current"/>
    <xsl:apply-templates select="*" mode="#current"/>
  </xsl:element>
</xsl:template>
675
<!-- Corpus-level header: text-bearing elements (title inside bibl, edition, desc)
     are re-created under their local name, without the TEI namespace.
     Attributes go through the current (corpus) mode; the child nodes are
     processed in the header-text mode. -->
<xsl:template mode="corpus"
              match="tei:bibl/tei:title
                     | tei:edition
                     | tei:desc">
  <xsl:element name="{local-name(.)}">
    <xsl:apply-templates select="@*" mode="#current"/>
    <xsl:apply-templates select="node()" mode="header-text"/>
  </xsl:element>
</xsl:template>
682<!--
683 <xsl:template match="tei:textClass" mode="corpus">
684 <xsl:element name="{local-name()}">
685 <xsl:apply-templates mode="corpus" select="@* | *"/>
686 </xsl:element>
687 </xsl:template>
688
689 <xsl:template match="tei:catRef" mode="corpus">
690 <xsl:element name="{local-name()}">
691 <xsl:apply-templates mode="corpus" select="@* | *"/>
692 </xsl:element>
693 </xsl:template>
694-->
bansp5e2d1c02022-03-10 04:51:40 +0100695
696
697
<!-- this template can be called by the XSpec test; TODO: find a way to call the main() template directly -->
<!-- I have not fully handled the param transmission, which would have to be kludged in just for the sake of XSpec,
    because I'm disabling this for now, due to XSpec design issues; relevant links, among others:
701
702 https://stackoverflow.com/questions/64933277/what-is-the-cause-of-error-cannot-execute-xslresult-document-while-evaluating
703 https://www.balisage.net/Proceedings/vol25/html/Galtman01/BalisageVol25-Galtman01.html
704
705 In short: the internal design of XSpec forces kludges when one wants to use xsl:result-document in their stylesheets. But I don't
706 want to be strangled by kludges at the beginning of work, I've already lost quite a bit of time on this investigation,
707 I will therefore "just code" and then can think of externalizing bits of templates if we want to play with tests. For now,
    I don't want to have to handle context items in a special way inside variables, etc., because I'm not sure it's worth it.
709
710 -->
711 <!--<xsl:template name="test_full">
712 <xsl:param name="corpusID"/>
713 <xsl:param name="docID"/>
714 <xsl:param name="textID"/>
715 <xsl:call-template name="xsl:initial-template"/>
716 </xsl:template>-->
717
Akron9a8ee3e2022-01-31 13:51:49 +0100718</xsl:stylesheet>