blob: acc425d44cc81b1190774dc6b7e121a1a81a621c [file] [log] [blame]
Akron9a8ee3e2022-01-31 13:51:49 +01001<?xml version="1.0" encoding="UTF-8"?>
2<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
bansp5e2d1c02022-03-10 04:51:40 +01003 xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"
4 xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:f="func"
Piotr Banskif8af3a92022-05-23 03:20:10 +02005 xmlns:fn="http://www.w3.org/2005/xpath-functions"
bansp5e2d1c02022-03-10 04:51:40 +01006 xmlns:map="http://www.w3.org/2005/xpath-functions/map" exclude-result-prefixes="xs f map nkjp tei"
7 version="3.0" expand-text="yes">
Akron9a8ee3e2022-01-31 13:51:49 +01008
banspe726b4a2022-03-28 05:47:45 +02009
10<!-- PARAMETERS -->
bansp5e2d1c02022-03-10 04:51:40 +010011
<!-- Directory containing the NKJP files, organised as a collection of text-level
     directories (that layout is how both $corpusID and $docID are known). -->
<xsl:param name="sourceDir" as="xs:string" select="'test/resources/nkjp2korap_sample2'"/>

<!-- Directory in which the corpus/document/text/annotations hierarchy is created. -->
<xsl:param name="targetDir" as="xs:string" select="'test/output'"/>

<!-- Comma-separated list of document IDs to be skipped from processing,
     e.g. HellerPodgladanie,KOT
     Nothing beyond plain string identity is supported (this is just for testing). -->
<xsl:param name="skip_docID" as="xs:string"
  select="'HellerPodgladanie,IsakowiczZaleskiMoje,KolakowskiOco,MysliwskiKamien,WilkWilczy,ZycieWarszawy_Zycie'"/>


<!-- VARIABLES (= constants...) -->

<xsl:variable name="corpusID" static="yes" as="xs:string" select="'NKJP'"/>
<xsl:variable name="docID" static="yes" as="xs:string" select="'NKJP'"/>

<!-- $targetDir/$corpusID/ with a trailing slash, ready for appending file or directory names -->
<xsl:variable name="targetCorpusDir_slashed" as="xs:string"
  select="concat($targetDir, '/', $corpusID, '/')"/>

<xsl:variable name="systemDoctypeI5" static="yes" as="xs:string"
  select="'http://corpora.ids-mannheim.de/I5/DTD/i5.dtd'"/>

<xsl:variable name="publicDoctypeI5" static="yes" as="xs:string"
  select="'-//IDS//DTD I5 1.0//EN'"/>

<xsl:variable name="KorAP_namespace" static="yes" as="xs:string"
  select="'http://ids-mannheim.de/ns/KorAP'"/>

<!-- written into the @version attribute of the generated layer files -->
<xsl:variable name="KorAP-XML_version" static="yes" as="xs:string" select="'KorAP-0.4'"/>

<!-- query parameters appended to $sourceDir when the collection is opened;
     see https://www.saxonica.com/documentation11/index.html#!sourcedocs/collections/collection-directories -->
<xsl:variable name="collection_params" static="yes" as="xs:string"
  select="'recurse=yes;validation=strip;select=text.xml;content-type=application/xml;on-error=warning;xinclude=yes'"/>

<!-- every text.xml document found under $sourceDir -->
<xsl:variable name="collection_of_text" as="document-node()+"
  select="collection(concat($sourceDir, '?', $collection_params))"/>

<!-- These two 'flags' are meant to increase the readability of the code:
     they index the output of the calc_offsets() function, whose returned value
     is a sequence, (start, end).

     Remove together with the function!
-->
<xsl:variable name="OFFSET_START" static="yes" as="xs:integer" select="1"/>
<xsl:variable name="OFFSET_END" static="yes" as="xs:integer" select="2"/>
banspb5992532022-03-29 15:55:44 +020063
64
banspe726b4a2022-03-28 05:47:45 +020065<!-- MODES -->
bansp5e2d1c02022-03-10 04:51:40 +010066
<!-- 'corpus' and 'text': a node without a matching rule is dropped together with its subtree -->
<xsl:mode on-no-match="deep-skip" name="corpus"/>
<xsl:mode on-no-match="deep-skip" name="text"/>
<!-- 'header-text': a node without a matching rule contributes only its text content -->
<xsl:mode on-no-match="text-only-copy" name="header-text"/>
bansp5e2d1c02022-03-10 04:51:40 +010070
banspe726b4a2022-03-28 05:47:45 +020071
72 <!-- FUNCTIONS -->
73
<!-- Returns the nesting depth of $node: the number of elements on its
     ancestor-or-self axis, not counting any element whose local name is
     'TEI' or 'teiCorpus' (namespaces are not consulted). -->
<xsl:function name="f:compute_nesting" as="xs:integer">
  <xsl:param name="node" as="element()"/>
  <xsl:sequence
    select="count($node/ancestor-or-self::*[not(local-name() = ('TEI', 'teiCorpus'))])"/>
</xsl:function>
81
<!-- Tells whether whitespace is to be assumed in front of $node.

     $node must be an element whose local name is 'seg', 's' or 'p';
     any other element terminates the transformation with a message.

     * seg: true unless it carries @nkjp:nps, and unless it is a first tei:seg
            sibling inside the first tei:s of the first tei:p
     * s:   true when a tei:s precedes it among its siblings, or a tei:p precedes
            its nearest tei:p ancestor
     * p:   true when a tei:p precedes it among its siblings

     The 's' and 'p' branches also emit debugging messages. Those used to apply
     the '||' operator directly to the preceding-sibling sequence, which is a type
     error (XPTY0004) as soon as there are two or more preceding siblings; the
     sequences are now flattened with string-join() first. -->
<xsl:function name="f:is_preceded_by_ws" as="xs:boolean">
  <xsl:param name="node" as="element()"/>
  <xsl:choose>
    <xsl:when test="local-name($node) eq 'seg'">
      <xsl:sequence
        select="not(exists($node/@nkjp:nps)) and not($node[count(preceding-sibling::tei:seg) eq 0]/ancestor::tei:s[count(preceding-sibling::tei:s) eq 0]/ancestor::tei:p[count(preceding-sibling::tei:p) eq 0])"/>
      <!-- earlier variant of the second conjunct, kept for reference:
           and not($node/ancestor::tei:s[count(preceding-sibling::tei:s) eq 0]/ancestor::tei:p[count(preceding-sibling::tei:p) eq 0]) -->
    </xsl:when>
    <xsl:when test="local-name($node) eq 's'">
      <xsl:message select="'s - prec s: ' || string-join($node/preceding-sibling::tei:s, ' ')"/>
      <xsl:message select="'same s - prec p: ' || string-join($node/ancestor::tei:p[1]/preceding-sibling::tei:p, ' ') || '&#10;'"/>
      <xsl:sequence
        select="exists($node/preceding-sibling::tei:s) or exists($node/ancestor::tei:p[1]/preceding-sibling::tei:p)"/>
    </xsl:when>
    <xsl:when test="local-name($node) eq 'p'">
      <xsl:message select="'p : ' || string-join($node/preceding-sibling::tei:p, ' ')"/>
      <xsl:sequence select="exists($node/preceding-sibling::tei:p)"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:message terminate="yes"
        select="'Wrong argument passed to f:is_preceded_by_ws(): ' || local-name($node) || ' Only p, s, seg are allowed.'"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>
banspd1bf1db2022-04-04 02:16:24 +0200107
Piotr Banski4f4c2d22022-05-19 01:44:32 +0200108
banspd1bf1db2022-04-04 02:16:24 +0200109
Piotr Banski4f4c2d22022-05-19 01:44:32 +0200110
bansp5e2d1c02022-03-10 04:51:40 +0100111
banspb5992532022-03-29 15:55:44 +0200112
113<!-- UTILITY TEMPLATES -->
114
<!-- Suppresses the auto-inserted @default attribute in every mode. -->
<xsl:template mode="#all" match="@default"/>
bansp9103aab2022-03-19 05:10:21 +0100117
bansp9dc10002022-05-17 22:33:34 +0200118 <!--<xsl:template match="tei:w" mode="#all"/> w is better than ab, now ... -->
<!-- NKJP-SGJP has apparently abandoned standoff representations by adding <w> everywhere;
     for the time being, we'll just stick to the standoff offsets, although that may need to
     be revisited as the NKJP format has now begun to stray from its schemas and assumptions -->
bansp8f6700b2022-03-27 05:27:09 +0200122
<!-- Suppresses tei:choice (and therefore everything inside it) in every mode. -->
<xsl:template mode="#all" match="tei:choice"/>
<!-- THIS IS ONLY TEMPORARY,
     because an interesting challenge came up where I will
     probably have to abandon straightforward mapping because of TOKENIZATION alternatives;

     but for now, I just want this stylesheet to work, even if it eats an occasional token
     (which it now does: 'komuś' and 'czym' vanish)
-->
bansp8f6700b2022-03-27 05:27:09 +0200130
banspb5992532022-03-29 15:55:44 +0200131
132 <!-- MAIN PROCESSING -->
133
134
<!-- Entry point: writes the corpus header once, then converts every text.xml
     of the collection, except for the texts listed in $skip_docID. -->
<xsl:template name="xsl:initial-template">
  <xsl:variable name="IDs_to_skip" as="xs:string*" select="tokenize($skip_docID, ',')"/>

  <!-- The corpus header is created only once. Any NKJP corpus document will do as
       its source, because the main corpus header can (should) be reachable from
       each of them, so the first document of the collection is passed. -->
  <xsl:call-template name="create_corpus_header">
    <xsl:with-param name="text.xml" as="document-node()" select="$collection_of_text[1]"/>
    <xsl:with-param name="target" as="xs:string"
      select="concat($targetCorpusDir_slashed, 'header.xml')"/>
  </xsl:call-template>

  <xsl:for-each select="$collection_of_text">
    <!-- directory of the current text.xml; its last path step serves as the text ID -->
    <xsl:variable name="text_dir" as="xs:string" select="replace(base-uri(), '/text\.xml', '')"/>
    <xsl:variable name="text_id" as="xs:string" select="tokenize($text_dir, '/')[last()]"/>

    <!-- utility step, for when some texts are to be ignored for any reason
         (debugging, selective update) -->
    <xsl:if test="not($text_id = $IDs_to_skip)">
      <xsl:call-template name="process_single_sample">
        <xsl:with-param name="text.xml" as="document-node()" select="."/>
        <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
          select="doc(concat($text_dir, '/ann_morphosyntax.xml'))"/>
        <xsl:with-param name="ann_segmentation.xml" as="document-node()"
          select="doc(concat($text_dir, '/ann_segmentation.xml'))"/>
        <xsl:with-param name="my_textID" as="xs:string" select="$text_id"/>
      </xsl:call-template>
    </xsl:if>
  </xsl:for-each>
</xsl:template>
174
<!-- Converts a single NKJP text: calls create_data, create_struct and
     create_text_header, each of which receives its own target path below
     $targetCorpusDir_slashed/$docID/$my_textID.

     text.xml              the text.xml document of the sample
     ann_morphosyntax.xml  its morphosyntactic layer (only needed by the
                           currently disabled create_morpho call)
     ann_segmentation.xml  its segmentation layer
     my_textID             ID of the text; the default is a marker value that
                           should never show up in the output -->
<xsl:template name="process_single_sample">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="my_textID" as="xs:string" select="'0-BAD_textID'"/>

  <xsl:variable name="out_dir" as="xs:string"
    select="concat($targetCorpusDir_slashed, $docID, '/', $my_textID)"/>

  <!-- this is what occurs in the text and data layers as @docid -->
  <xsl:variable name="full_id" as="xs:string"
    select="concat($corpusID, '_', $docID, '.', $my_textID)"/>

  <xsl:call-template name="create_data">
    <xsl:with-param name="ann_segmentation.xml" as="document-node()" select="$ann_segmentation.xml"/>
    <xsl:with-param name="compoundID" as="xs:string" select="$full_id"/>
    <xsl:with-param name="target" as="xs:string" select="concat($out_dir, '/data.xml')"/>
  </xsl:call-template>

  <xsl:call-template name="create_struct">
    <xsl:with-param name="compoundID" as="xs:string" select="$full_id"/>
    <xsl:with-param name="ann_segmentation.xml" as="document-node()" select="$ann_segmentation.xml"/>
    <xsl:with-param name="target" as="xs:string" select="concat($out_dir, '/struct/structure.xml')"/>
  </xsl:call-template>

  <!-- disabled for now:
  <xsl:call-template name="create_morpho">
    <xsl:with-param name="text.xml" select="$text.xml" as="document-node()"/>
    <xsl:with-param name="compoundID" select="$compoundID" as="xs:string"/>
    <xsl:with-param name="ann_segmentation.xml" select="$ann_segmentation.xml" as="document-node()"/>
    <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml" as="document-node()"/>
    <xsl:with-param name="target" select="$targetBaseDir || '/nkjp/morpho.xml'" as="xs:string"/>
  </xsl:call-template>
  -->

  <xsl:call-template name="create_text_header">
    <xsl:with-param name="text.xml" as="document-node()" select="$text.xml"/>
    <xsl:with-param name="compoundID" as="xs:string" select="$full_id"/>
    <xsl:with-param name="target" as="xs:string" select="concat($out_dir, '/header.xml')"/>
  </xsl:call-template>
</xsl:template>
221
222 <!-- ************************** data.xml ******************* -->
223
<!-- Writes the data.xml file (raw text layer) to $target.

     ann_segmentation.xml  segmentation layer the text is reassembled from
     compoundID            value of raw_text/@docid
     target                href of the result document

     The text is rebuilt from all tei:seg elements without @nkjp:rejected: each one
     contributes its tei:w content, preceded by a single space whenever
     f:is_preceded_by_ws() says so. -->
<xsl:template name="create_data">
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="target" as="xs:string"/>

  <!-- xpath-default-namespace is not an attribute value template, so the namespace
       is spelled out literally here, as in create_struct and create_morpho -->
  <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">

    <xsl:processing-instruction name="xml-model">href=&quot;text.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
    <xsl:element name="raw_text" namespace="{$KorAP_namespace}">
      <xsl:attribute name="docid" select="$compoundID"/>
      <xsl:element name="metadata" namespace="{$KorAP_namespace}">
        <xsl:attribute name="file" select="'metadata.xml'"/>
      </xsl:element>

      <xsl:element name="text" namespace="{$KorAP_namespace}">
        <!-- xs:string* rather than xs:string+, so that a text without any accepted
             segment yields an empty text element instead of a type error -->
        <xsl:variable name="content" as="xs:string*">
          <xsl:for-each
            select="$ann_segmentation.xml/tei:teiCorpus/tei:TEI/tei:text/tei:body/tei:p/tei:s//tei:seg[count(@nkjp:rejected) eq 0]">
            <xsl:sequence select="
                if (f:is_preceded_by_ws(.)) then
                  ' '
                else
                  '', ./tei:w"/>
          </xsl:for-each>
        </xsl:variable>
        <xsl:value-of select="string-join($content)"/>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
255
bansp5f841732022-03-16 06:27:31 +0100256 <!-- ************************** struct ******************* -->
257
<!-- Writes the structure layer (struct/structure.xml) to $target.

     compoundID            value of layer/@docid
     ann_segmentation.xml  segmentation layer of the text
     target                href of the result document

     Two lookup maps are built first; at present they are only reported on through
     xsl:message, and the spanList written to the result document is still empty
     (the apply-templates call that would fill it is disabled). -->
<xsl:template name="create_struct">
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <!-- Map the entire document, so that the processing only takes place once, and for
       fast lookups.
       Key:   @xml:id of each tei:seg without @nkjp:rejected
       Value: (1) @xml:id of its tei:p, (2) @xml:id of its tei:s,
              (3) its position within the sentence group, (4) f:is_preceded_by_ws(),
              (5) normalize-space() of its tei:w

       TODO: move this up to process_single_sample, to re-use it -->
  <xsl:variable name="map_w" as="map(xs:untypedAtomic,item()+)">
    <xsl:variable name="segs" as="element()+"
      select="$ann_segmentation.xml/tei:teiCorpus/tei:TEI/tei:text/tei:body/tei:p/tei:s//tei:seg[count(@nkjp:rejected) eq 0]"/>
    <xsl:map>
      <xsl:for-each-group select="$segs" group-by="ancestor::tei:p[1]/@xml:id">
        <xsl:variable name="current-p" select="current-grouping-key()"/>
        <xsl:for-each-group select="current-group()" group-by="ancestor::tei:s[1]/@xml:id">
          <xsl:variable name="current-s" select="current-grouping-key()"/>
          <xsl:for-each select="current-group()">
            <xsl:map-entry key="@xml:id"
              select="$current-p, $current-s, position(), f:is_preceded_by_ws(.), normalize-space(tei:w)"/>
          </xsl:for-each>
        </xsl:for-each-group>
      </xsl:for-each-group>
    </xsl:map>
  </xsl:variable>

  <!-- One entry per paragraph, sentence and word, keyed by ID:
         p: ('p', position, length)
         s: ('s', position within p, length, ID of p)
         w: ('w', position within s, length, ID of s, is_preceded_by_ws) -->
  <xsl:variable name="map_s-p" as="map(xs:untypedAtomic,item()+)">
    <xsl:map>
      <xsl:for-each-group select="map:keys($map_w)" group-by="map:get($map_w, .)[1]">
        <xsl:sort select="xs:integer(substring-before(substring-after(., 'segm_'), '.'))"
          order="ascending"/>
        <xsl:sort select="xs:integer(substring-before(substring-after(., '.'), '-'))"
          order="ascending"/>
        <xsl:variable name="current-p-pos" select="fn:position()" as="xs:integer"/>
        <!-- the above is used in the sentence loop, when we check if it's text-initial -->
        <xsl:variable name="current-p" select="fn:current-grouping-key()" as="xs:string"/>
        <xsl:variable name="p-length" select="
            sum(for $id in current-group()
            return
              string-length(map:get($map_w, $id)[5])) + count(current-group()) - xs:integer(position() ne 1) -
            count(fn:filter(current-group(), function ($w-id) {
              map:get($map_w, $w-id)[4] eq false()
            }))"/>
        <!-- The general algorithm is:
             * count and sum the lengths of all the words
             * add 'whitespace' for all of them (= count them and add that), and then
             * subtract whitespace for those of them that are not actually preceded by it
             and if the 1st word is_preceded_by_ws then subtract 1
             because identifying that 1st word would require an extra step, we're taking a
             shortcut via position() - and that strongly depends on the presence of the
             xsl:sort instructions -->

        <xsl:message select="'sum: ' || sum( for $id in current-group() return string-length(map:get($map_w, $id)[5]) )"/>
        <xsl:message select="'cur-group count: ' || count(fn:current-group())"/>
        <xsl:message select="'subtract:' || count(fn:filter(current-group(), function($w-id) { map:get($map_w,$w-id)[4] eq false() } ))"/>
        <xsl:message select="'position: ' || position() || ', xs:integer(position() ne 1)=' || xs:integer(position() ne 1)"/>
        <xsl:message select="'p-length: ' || $p-length"/>

        <xsl:map-entry key="current-grouping-key()" select="'p', position(), $p-length"/>

        <xsl:message select="'p: ', $current-p || ' pos:' || position(), current-group()"/>

        <xsl:for-each-group select="current-group()" group-by="map:get($map_w, .)[2]">
          <xsl:sort select="xs:integer(substring-before(substring-after(., 'segm_'), '.'))"
            order="ascending"/>
          <xsl:sort select="xs:integer(substring-before(substring-after(., '.'), '-'))"
            order="ascending"/>
          <xsl:variable name="current-s" select="fn:current-grouping-key()" as="xs:string"/>
          <!-- Same algorithm as for $p-length. The leading whitespace is subtracted for
               every sentence except the text-initial one, i.e. the first sentence of the
               first paragraph. Testing the paragraph position alone, as was done before,
               left that whitespace inside the length of each non-initial sentence of the
               first paragraph. -->
          <xsl:variable name="s-length" select="
              sum(for $id in current-group()
              return
                string-length(map:get($map_w, $id)[5])) + count(current-group()) - xs:integer($current-p-pos ne 1 or position() ne 1) -
              count(fn:filter(current-group(), function ($w-id) {
                map:get($map_w, $w-id)[4] eq false()
              }))"/>

          <xsl:map-entry key="current-grouping-key()" select="'s', position(), $s-length, $current-p"/>

          <xsl:message select="'s: ', position(), current-group()"/>

          <xsl:for-each select="current-group()">
            <xsl:sort select="map:get($map_w, .)[3]" order="ascending"/>
            <xsl:map-entry key="."
              select="'w', position(), string-length(map:get($map_w, .)[5]), $current-s, map:get($map_w, .)[4]"/>
          </xsl:for-each>

        </xsl:for-each-group>

      </xsl:for-each-group>
    </xsl:map>
  </xsl:variable>

  <xsl:message select="'map_w size: ' || map:size($map_w)"/>
  <xsl:message select="'map_s-p size: ' || map:size($map_s-p)"/>

  <xsl:result-document encoding="UTF-8" method="xml" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">
    <xsl:processing-instruction name="xml-model">href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
    <xsl:element name="layer" namespace="{$KorAP_namespace}">
      <xsl:attribute name="docid" select="$compoundID"/>
      <xsl:attribute name="version" select="$KorAP-XML_version"/>

      <xsl:element name="spanList" namespace="{$KorAP_namespace}">
        <!--<xsl:apply-templates select="$ann_segmentation.xml//tei:text" mode="struct"/> -->
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
370
<!-- Mode 'struct': emits one KorAP span for the current TEI element, then processes
     its children in the same mode.

     ini, fin  start and end offsets handed down by the parent (not read here)
     index     running index handed down by the parent; the basis of the span ID

     The computation is deliberately spread over many variables, so that the individual
     constituent values can be looked up should anything go wrong; optimization is to
     come once this has been run against a larger dataset. -->
<xsl:template match="tei:*" mode="struct">
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>

  <xsl:variable name="my_name" as="xs:string" select="local-name()"/>
  <!-- preceding siblings that share the local name of the current element -->
  <xsl:variable name="same-name-before" select="preceding-sibling::*[local-name() eq $my_name]"/>
  <xsl:variable name="same-name-count" select="count($same-name-before)"/>

  <!-- for s and p only: the number of elements nested in those preceding siblings
       (mind @nkjp:rejected); sum() over an empty sequence is 0 -->
  <xsl:variable name="nested-before" as="xs:integer"
    select="
      if (self::tei:s or self::tei:p) then
        sum($same-name-before ! count(descendant::*))
      else
        0"/>

  <xsl:variable name="my_index" as="xs:integer"
    select="$index + 1 + $same-name-count + $nested-before"/>

  <!-- start offset: 0 for text and body; otherwise the second comma-separated field
       inside the parentheses of the relevant @corresp (of the first seg for p and s,
       own for seg) -->
  <xsl:variable name="start" as="xs:integer">
    <xsl:choose>
      <xsl:when test="self::tei:seg">
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before(@corresp, ')'), ','), ','))"/>
      </xsl:when>
      <xsl:when test="self::tei:s">
        <xsl:variable name="anchor" as="attribute(corresp)"
          select="descendant::tei:seg[1]/@corresp"/>
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before($anchor, ')'), ','), ','))"/>
      </xsl:when>
      <xsl:when test="self::tei:p">
        <xsl:variable name="anchor" as="attribute(corresp)"
          select="descendant::tei:s[1]/descendant::tei:seg[1]/@corresp"/>
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before($anchor, ')'), ','), ','))"/>
      </xsl:when>
      <xsl:when test="self::tei:text or self::tei:body">
        <xsl:sequence select="0"/>
      </xsl:when>
    </xsl:choose>
  </xsl:variable>

  <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

  <xsl:element name="span" namespace="{$KorAP_namespace}">
    <xsl:attribute name="id" select="concat('s', $my_index)"/>
    <xsl:attribute name="from" select="$start"/>
    <xsl:attribute name="to" select="$end"/>
    <xsl:attribute name="l" select="f:compute_nesting(.)"/>
    <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
      <!-- STRUCT vs. LEX -->
      <xsl:attribute name="type" select="'struct'"/>
      <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
        <xsl:attribute name="name" select="'name'"/>
        <xsl:value-of select="$my_name"/>
      </xsl:element>
      <!-- the attributes of the source element, if any, become a nested feature structure -->
      <xsl:if test="exists(@*)">
        <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
          <xsl:attribute name="name" select="'attr'"/>
          <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="type" select="'attr'"/>
            <xsl:for-each select="@*">
              <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                <xsl:attribute name="name" select="local-name(.)"/>
                <xsl:value-of select="."/>
              </xsl:element>
            </xsl:for-each>
          </xsl:element>
        </xsl:element>
      </xsl:if>
    </xsl:element>
  </xsl:element>

  <xsl:apply-templates mode="struct">
    <xsl:with-param name="ini" as="xs:integer" select="$start"/>
    <xsl:with-param name="fin" as="xs:integer" select="$end"/>
    <xsl:with-param name="index" select="$my_index"/>
  </xsl:apply-templates>
</xsl:template>
471
472 <!-- ************************** morpho ******************* -->
473
<!-- Writes the morphosyntactic layer to $target: a KorAP layer element whose spanList
     is filled by processing every tei:text of the segmentation layer in mode 'morpho'.

     text.xml              the text.xml document (not read here)
     compoundID            value of layer/@docid
     ann_segmentation.xml  segmentation layer that drives the traversal
     ann_morphosyntax.xml  morphosyntactic layer, handed on to the 'morpho' templates
     target                href of the result document -->
<xsl:template name="create_morpho">
  <xsl:param name="text.xml" as="document-node()"/>
  <xsl:param name="compoundID" as="xs:string"/>
  <xsl:param name="ann_segmentation.xml" as="document-node()"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>
  <xsl:param name="target" as="xs:string"/>

  <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
    xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
    <xsl:processing-instruction name="xml-model">href=&quot;span.rng&quot; type=&quot;application/xml&quot; schematypens=&quot;http://relaxng.org/ns/structure/1.0&quot;</xsl:processing-instruction>
    <xsl:element name="layer" namespace="{$KorAP_namespace}">
      <xsl:attribute name="docid" select="$compoundID"/>
      <xsl:attribute name="version" select="$KorAP-XML_version"/>

      <xsl:element name="spanList" namespace="{$KorAP_namespace}">
        <xsl:apply-templates mode="morpho" select="$ann_segmentation.xml/descendant::tei:text">
          <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
            select="$ann_morphosyntax.xml"/>
        </xsl:apply-templates>
      </xsl:element>
    </xsl:element>
  </xsl:result-document>
</xsl:template>
496
<!-- Mode 'morpho', generic TEI element: produces no output of its own; it computes
     the index and the offsets of the current element and hands them down to its
     children, which are processed in the same mode.

     ini, fin              offsets handed down by the parent (not read here)
     index                 running index handed down by the parent
     ann_morphosyntax.xml  morphosyntactic layer, passed on unchanged -->
<xsl:template match="tei:*" mode="morpho">
  <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
  <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
  <xsl:param name="index" as="xs:integer" required="no" select="1"/>
  <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>

  <xsl:variable name="my_name" as="xs:string" select="local-name()"/>
  <!-- preceding siblings that share the local name of the current element -->
  <xsl:variable name="same-name-before" select="preceding-sibling::*[local-name() eq $my_name]"/>
  <xsl:variable name="same-name-count" select="count($same-name-before)"/>

  <!-- for s and p only: the number of elements nested in those preceding siblings;
       sum() over an empty sequence is 0 -->
  <xsl:variable name="nested-before" as="xs:integer"
    select="
      if (self::tei:s or self::tei:p) then
        sum($same-name-before ! count(descendant::*))
      else
        0"/>

  <xsl:variable name="my_index" as="xs:integer"
    select="$index + 1 + $same-name-count + $nested-before"/>

  <!-- start offset: 0 for text and body; for p and s the second comma-separated field
       inside the parentheses of the first seg's @corresp (there is no branch for seg
       in this template) -->
  <xsl:variable name="start" as="xs:integer">
    <xsl:choose>
      <xsl:when test="self::tei:s">
        <xsl:variable name="anchor" as="attribute(corresp)"
          select="descendant::tei:seg[1]/@corresp"/>
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before($anchor, ')'), ','), ','))"/>
      </xsl:when>
      <xsl:when test="self::tei:p">
        <xsl:variable name="anchor" as="attribute(corresp)"
          select="descendant::tei:s[1]/descendant::tei:seg[1]/@corresp"/>
        <xsl:sequence
          select="xs:integer(substring-before(substring-after(substring-before($anchor, ')'), ','), ','))"/>
      </xsl:when>
      <xsl:when test="self::tei:text or self::tei:body">
        <xsl:sequence select="0"/>
      </xsl:when>
    </xsl:choose>
  </xsl:variable>

  <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

  <xsl:apply-templates mode="morpho">
    <xsl:with-param name="ini" as="xs:integer" select="$start"/>
    <xsl:with-param name="fin" as="xs:integer" select="$end"/>
    <xsl:with-param name="index" select="$my_index"/>
    <xsl:with-param name="ann_morphosyntax.xml" as="document-node()"
      select="$ann_morphosyntax.xml"/>
  </xsl:apply-templates>
</xsl:template>
bansp5f841732022-03-16 06:27:31 +0100563
<!-- Emits one KorAP <span> describing the disambiguated morphosyntactic
     reading of a segmentation-layer tei:seg.

     Parameters:
       ini, fin             - accepted for interface compatibility with the other
                              "morpho" templates; not used by this template.
       index                - base value for the span id; the id written is
                              's' || ($index + 1 + number of preceding siblings
                              that have the same local name).
       ann_morphosyntax.xml - document that is searched for the tei:seg whose
                              @corresp, after the '#', equals this seg's @xml:id.

     Output: <span id from to l> in $KorAP_namespace, containing a TEI
     fs[@type='lex'] with the features lemma, pos, and optionally msd and join.

     The variables typed as="node()" / as="xs:string" require exactly one match;
     a segment without a unique morphosyntactic counterpart therefore raises a
     type error instead of being skipped silently.
-->
<xsl:template match="tei:seg" mode="morpho">
    <xsl:param name="ini" as="xs:integer" required="no" select="0"/>
    <xsl:param name="fin" as="xs:integer" required="no" select="999999999"/>
    <xsl:param name="index" as="xs:integer" required="no" select="1"/>
    <xsl:param name="ann_morphosyntax.xml" as="document-node()"/>

    <!-- The lookup is deliberately spread over several variables so that each
         intermediate value can be inspected should anything go wrong. -->
    <xsl:variable name="my_name" select="local-name()" as="xs:string"/>
    <xsl:variable name="my_id" select="@xml:id" as="xs:string"/>
    <!-- the morphosyntax-layer seg that points back at this segment -->
    <xsl:variable name="my_morph-seg" as="node()"
        select="$ann_morphosyntax.xml//tei:seg[substring-after(@corresp,'#') eq $my_id]"/>
    <xsl:variable name="my_disamb" as="node()"
        select="$my_morph-seg//tei:fs/tei:f[@name eq 'disamb']"/>
    <!-- id of the interpretation chosen by the disambiguation (the part of @fVal after '#') -->
    <xsl:variable name="my_choice-id" as="xs:string"
        select="substring-after($my_disamb//tei:f[@name eq 'choice']/@fVal,'#')"/>
    <!-- the 'lex' feature structure that contains the chosen symbol -->
    <xsl:variable name="my_choice-lex" as="node()"
        select="$my_morph-seg//tei:f[@name eq 'interps']/tei:fs[@type eq 'lex'][descendant::tei:symbol[@xml:id eq $my_choice-id]]"/>
    <xsl:variable name="chosen-msd" as="xs:string"
        select="$my_choice-lex/descendant::tei:symbol[@xml:id eq $my_choice-id]/@value"/>

    <xsl:variable name="preceding" select="preceding-sibling::*[local-name(.) eq $my_name]"/>
    <xsl:variable name="preceding-count" select="count($preceding)"/>
    <!-- No "outside-preceding-count" term is added here: that correction only
         ever applied to tei:s / tei:p and was always 0 for a tei:seg. -->
    <xsl:variable name="my_index" select="$index + 1 + $preceding-count" as="xs:integer"/>

    <!-- @corresp is parsed as "...(X,A,B)...": $numbers is "A,B", $start is A -->
    <xsl:variable name="numbers" as="xs:string"
        select="substring-after(substring-before(@corresp,')'),',')"/>
    <xsl:variable name="start" as="xs:integer" select="xs:integer(substring-before($numbers,','))"/>
    <xsl:variable name="end" as="xs:integer" select="f:calc_content_length(.)"/>

    <xsl:element name="span" namespace="{$KorAP_namespace}">
        <xsl:attribute name="id" select="'s' || $my_index"/>
        <xsl:attribute name="from" select="$start"/>
        <xsl:attribute name="to" select="$end"/>
        <xsl:attribute name="l" select="f:compute_nesting(.)"/>
        <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
            <xsl:attribute name="type" select="'lex'"/>
            <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                <xsl:attribute name="name" select="'lex'"/>
                <xsl:element name="fs" namespace="http://www.tei-c.org/ns/1.0">
                    <!-- the orthographic form is written as an XML comment, for human readers only -->
                    <xsl:comment select="$my_morph-seg//tei:fs/tei:f[@name eq 'orth']/tei:string"/>

                    <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                        <xsl:attribute name="name" select="'lemma'"/>
                        <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'base']/tei:string"/>
                    </xsl:element>
                    <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                        <xsl:attribute name="name" select="'pos'"/>
                        <xsl:value-of select="$my_choice-lex/tei:f[@name eq 'ctag']/tei:symbol/@value"/>
                    </xsl:element>
                    <!-- msd is only written when the chosen symbol has a non-empty @value -->
                    <xsl:if test="string-length($chosen-msd)">
                        <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                            <xsl:attribute name="name" select="'msd'"/>
                            <xsl:value-of select="$chosen-msd"/>
                        </xsl:element>
                    </xsl:if>
                    <!-- an 'nps' feature on the morph seg is rendered as join="left" -->
                    <xsl:if test="$my_morph-seg//tei:fs/tei:f[@name eq 'nps']">
                        <xsl:element name="f" namespace="http://www.tei-c.org/ns/1.0">
                            <xsl:attribute name="name" select="'join'"/>
                            <xsl:value-of select="'left'"/>
                        </xsl:element>
                    </xsl:if>
                </xsl:element>
            </xsl:element>
        </xsl:element>
    </xsl:element>

    <!-- Descending into the children of a seg is switched off. The call is kept
         for reference; it used to lack the opening comment delimiter, which left
         it active and wrote a stray closing delimiter into the output.
    <xsl:apply-templates mode="morpho">
        <xsl:with-param name="ini" select="$start" as="xs:integer"/>
        <xsl:with-param name="fin" select="$end" as="xs:integer"/>
        <xsl:with-param name="index" select="$my_index"/>
        <xsl:with-param name="ann_morphosyntax.xml" select="$ann_morphosyntax.xml" as="document-node()"/>
    </xsl:apply-templates>
    -->
</xsl:template>
banspe726b4a2022-03-28 05:47:45 +0200653
bansp5f841732022-03-16 06:27:31 +0100654 <!-- ************************** TEXT header ******************* -->
655
<!-- Writes the text-level header file to $target.
     text.xml   - document whose tei:TEI/tei:teiHeader children are transformed in mode "text"
     compoundID - passed on as a tunnel parameter to the "text" mode templates
     target     - href of the result document -->
<xsl:template name="create_text_header">
    <xsl:param name="text.xml" as="document-node()"/>
    <xsl:param name="compoundID" as="xs:string"/>
    <xsl:param name="target" as="xs:string"/>

    <!-- every TEI-namespace child of the text's teiHeader -->
    <xsl:variable name="header_parts" as="element()*"
        select="$text.xml//tei:TEI/tei:teiHeader/tei:*"/>

    <!-- create the local header.xml file -->
    <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes"
        xpath-default-namespace="http://ids-mannheim.de/ns/KorAP">
        <idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
            <xsl:apply-templates select="$header_parts" mode="text">
                <xsl:with-param name="compoundID" select="$compoundID" as="xs:string" tunnel="yes"/>
            </xsl:apply-templates>
        </idsHeader>
    </xsl:result-document>
</xsl:template>
672
<!-- text header: re-create fileDesc without a namespace and process its content in the same mode -->
<xsl:template match="tei:fileDesc" mode="text">
    <xsl:element name="fileDesc">
        <xsl:apply-templates select="node()" mode="#current"/>
    </xsl:element>
</xsl:template>
678
<!-- text header: a TEI title becomes t.title.
     NOTE(review): the content is processed in the default mode and attributes are
     not copied, unlike the corpus-mode title template, which uses mode
     "header-text" and copies attributes; confirm that this difference is intended. -->
<xsl:template match="tei:title" mode="text">
    <t.title>
        <xsl:apply-templates select="node()"/>
    </t.title>
</xsl:template>
684
<!-- text header: titleStmt starts with a textSigle holding the tunnelled
     $compoundID, followed by the transformed original content -->
<xsl:template match="tei:titleStmt" mode="text">
    <xsl:param name="compoundID" as="xs:string" tunnel="yes"/>
    <titleStmt>
        <!-- text value template; relies on expand-text="yes" set on xsl:stylesheet -->
        <textSigle>{$compoundID}</textSigle>
        <xsl:apply-templates select="node()" mode="#current"/>
    </titleStmt>
</xsl:template>
694
<!-- text header: re-create publicationStmt without a namespace and process its content in the same mode -->
<xsl:template match="tei:publicationStmt" mode="text">
    <xsl:element name="publicationStmt">
        <xsl:apply-templates select="node()" mode="#current"/>
    </xsl:element>
</xsl:template>
700
<!-- text header: re-create availability; only attributes and child elements are
     processed, so text nodes directly inside availability are dropped -->
<xsl:template match="tei:availability" mode="text">
    <xsl:element name="availability">
        <xsl:apply-templates select="@*, *" mode="#current"/>
    </xsl:element>
</xsl:template>
706
<!-- text header: re-create profileDesc without a namespace and process its content in the same mode -->
<xsl:template match="tei:profileDesc" mode="text">
    <xsl:element name="profileDesc">
        <xsl:apply-templates select="node()" mode="#current"/>
    </xsl:element>
</xsl:template>
bansp5f841732022-03-16 06:27:31 +0100712
<!-- text header: re-create textClass; only attributes and child elements are processed -->
<xsl:template match="tei:textClass" mode="text">
    <xsl:element name="textClass">
        <xsl:apply-templates select="@*, *" mode="#current"/>
    </xsl:element>
</xsl:template>
718
<!-- text and corpus headers: re-create catRef without a namespace.
     Attributes and child elements are processed in the mode this template was
     invoked in (#current), so corpus-mode processing no longer switches into
     "text" mode. The attribute templates are registered for both modes, so
     attribute handling is the same as before in either mode. -->
<xsl:template match="tei:catRef" mode="text corpus">
    <xsl:element name="{local-name()}">
        <xsl:apply-templates mode="#current" select="@* | *"/>
    </xsl:element>
</xsl:template>
724
<!-- text and corpus headers: the listed attributes are carried over unchanged;
     @xml:id only when it sits somewhere below a tei:classDecl -->
<xsl:template match="@status | @scheme | @target | @type | @xml:id[ancestor::tei:classDecl] | @xml:lang" mode="text corpus">
    <xsl:copy/>
</xsl:template>
728
<!-- text and corpus headers: re-create p without a namespace; its content is
     handed to the "header-text" mode -->
<xsl:template match="tei:p" mode="text corpus">
    <xsl:element name="p">
        <xsl:apply-templates select="node()" mode="header-text"/>
    </xsl:element>
</xsl:template>
734
735
736 <!-- OPTIMIZATION has to take modes into account -->
bansp5e2d1c02022-03-10 04:51:40 +0100737 <!-- ************************** CORPUS header ******************* -->
<!-- Writes the corpus-level header file to $target.
     text.xml - document whose tei:teiCorpus/tei:teiHeader children are transformed in mode "corpus"
     target   - href of the result document -->
<xsl:template name="create_corpus_header">
    <xsl:param name="text.xml" as="document-node()"/>
    <xsl:param name="target" as="xs:string"/>

    <!-- every TEI-namespace child of the corpus teiHeader -->
    <xsl:variable name="header_parts" as="element()*"
        select="$text.xml/tei:teiCorpus/tei:teiHeader/tei:*"/>

    <!-- create the corpus-level header.xml file -->
    <xsl:result-document href="{$target}" method="xml" encoding="UTF-8" indent="yes">

        <!--doctype-public="{$publicDoctypeI5}"
            doctype-system="{$systemDoctypeI5}">
            these are, sadly, useless
        -->

        <idsHeader type="corpus" pattern="text" status="new" version="1.1" TEIform="teiHeader">
            <xsl:apply-templates select="$header_parts" mode="corpus"/>
        </idsHeader>
    </xsl:result-document>
</xsl:template>
755
<!-- corpus header: re-create fileDesc without a namespace and process its content in the same mode -->
<xsl:template match="tei:fileDesc" mode="corpus">
    <xsl:element name="fileDesc">
        <xsl:apply-templates select="node()" mode="#current"/>
    </xsl:element>
</xsl:template>
bansp9103aab2022-03-19 05:10:21 +0100761
bansp5e2d1c02022-03-10 04:51:40 +0100762
<!-- corpus header: a TEI title becomes c.title; attributes are processed in the
     corpus mode, the content in the "header-text" mode -->
<xsl:template match="tei:title" mode="corpus">
    <c.title>
        <xsl:apply-templates select="@*" mode="#current"/>
        <xsl:apply-templates select="node()" mode="header-text"/>
    </c.title>
</xsl:template>
769
<!-- corpus header: titleStmt starts with a korpusSigle holding $corpusID
     (declared outside this template), followed by the transformed content -->
<xsl:template match="tei:titleStmt" mode="corpus">
    <titleStmt>
        <!-- text value template; relies on expand-text="yes" set on xsl:stylesheet -->
        <korpusSigle>{$corpusID}</korpusSigle>
        <xsl:apply-templates select="node()" mode="#current"/>
    </titleStmt>
</xsl:template>
778
<!-- corpus header: re-create publicationStmt without a namespace and process its content in the same mode -->
<xsl:template match="tei:publicationStmt" mode="corpus">
    <xsl:element name="publicationStmt">
        <xsl:apply-templates select="node()" mode="#current"/>
    </xsl:element>
</xsl:template>
784
<!-- corpus header: re-create availability; only attributes and child elements are processed -->
<xsl:template match="tei:availability" mode="corpus">
    <xsl:element name="availability">
        <xsl:apply-templates select="@*, *" mode="#current"/>
    </xsl:element>
</xsl:template>
790
<!-- corpus header: re-create encodingDesc without a namespace and process its content in the same mode -->
<xsl:template match="tei:encodingDesc" mode="corpus">
    <xsl:element name="encodingDesc">
        <xsl:apply-templates select="node()" mode="#current"/>
    </xsl:element>
</xsl:template>
796
<!-- corpus header: structural taxonomy elements are re-created under their own
     local name (no namespace); only attributes and child elements are processed -->
<xsl:template match="tei:classDecl | tei:taxonomy | tei:category | tei:taxonomy/tei:bibl" mode="corpus">
    <xsl:element name="{local-name(.)}">
        <xsl:apply-templates select="@*, *" mode="#current"/>
    </xsl:element>
</xsl:template>
802
<!-- corpus header: text-bearing elements are re-created under their own local
     name (no namespace); attributes are processed in the corpus mode, the
     content in the "header-text" mode -->
<xsl:template match="tei:bibl/tei:title | tei:edition | tei:desc" mode="corpus">
    <xsl:element name="{local-name(.)}">
        <xsl:apply-templates select="@*" mode="#current"/>
        <xsl:apply-templates select="node()" mode="header-text"/>
    </xsl:element>
</xsl:template>
809<!--
810 <xsl:template match="tei:textClass" mode="corpus">
811 <xsl:element name="{local-name()}">
812 <xsl:apply-templates mode="corpus" select="@* | *"/>
813 </xsl:element>
814 </xsl:template>
815
816 <xsl:template match="tei:catRef" mode="corpus">
817 <xsl:element name="{local-name()}">
818 <xsl:apply-templates mode="corpus" select="@* | *"/>
819 </xsl:element>
820 </xsl:template>
821-->
bansp5e2d1c02022-03-10 04:51:40 +0100822
823
824
<!-- This template could be called by the XSpec test; TODO: find a way to call the main() template directly. -->
<!-- Parameter passing is not fully handled here: it would have to be kludged in just for the sake of XSpec,
     and the template is disabled for now because of XSpec design issues. Relevant links, among others:

     https://stackoverflow.com/questions/64933277/what-is-the-cause-of-error-cannot-execute-xslresult-document-while-evaluating
     https://www.balisage.net/Proceedings/vol25/html/Galtman01/BalisageVol25-Galtman01.html

     In short: the internal design of XSpec forces kludges when one wants to use xsl:result-document in a stylesheet.
     I don't want to be strangled by kludges at the beginning of the work, and I have already lost quite a bit of
     time on this investigation. I will therefore "just code" and can later think about externalizing bits of
     templates if we want to play with tests. For now, I don't want to have to handle context items in a special
     way inside variables, etc., because I'm not sure it's worth it.
-->
838 <!--<xsl:template name="test_full">
839 <xsl:param name="corpusID"/>
840 <xsl:param name="docID"/>
841 <xsl:param name="textID"/>
842 <xsl:call-template name="xsl:initial-template"/>
843 </xsl:template>-->
844
<!-- f:calc_content_length($node) as xs:integer

     Returns A + B, where the relevant @corresp value is parsed as
     "...(X,A,B)...": the text before the first ')' is taken, then everything
     after its first ',', which is split at the next ','.
     (Presumably an NKJP string-range(id, start, length) pointer, so the result
     would be an end offset; confirm against the data.)

     Which @corresp is used depends on $node:
       tei:text / tei:body - last tei:seg of the last tei:s of the last tei:p
       tei:p               - last tei:seg of the last tei:s
       tei:s               - last tei:seg
       anything else       - the node's own @corresp
     For text, body, p and s, segs with @nkjp:rejected = 'true' are ignored when
     picking the last seg.

     Exactly one @corresp must be found, otherwise a type error is raised.
     The function produces no diagnostic messages. -->
<xsl:function name="f:calc_content_length" as="xs:integer">
    <xsl:param name="node" as="node()"/>

    <!-- the single pointer attribute that determines the result -->
    <xsl:variable name="last_corresp" as="attribute(corresp)">
        <xsl:choose>
            <xsl:when test="$node/self::tei:text or $node/self::tei:body">
                <xsl:sequence
                    select="$node/descendant::tei:p[last()]/descendant::tei:s[last()]/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/attribute::corresp"/>
            </xsl:when>
            <xsl:when test="$node/self::tei:p">
                <xsl:sequence
                    select="$node/descendant::tei:s[last()]/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/attribute::corresp"/>
            </xsl:when>
            <xsl:when test="$node/self::tei:s">
                <xsl:sequence
                    select="$node/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/attribute::corresp"/>
            </xsl:when>
            <xsl:otherwise>
                <!-- no filtering here: a rejected seg is measured by its own pointer -->
                <xsl:sequence select="$node/@corresp"/>
            </xsl:otherwise>
        </xsl:choose>
    </xsl:variable>

    <!-- "A,B" -->
    <xsl:variable name="numbers" as="xs:string"
        select="substring-after(substring-before($last_corresp, ')'), ',')"/>

    <xsl:sequence
        select="xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ','))"/>
</xsl:function>
879
<!-- f:calc_offsets($node, $skip_start) as xs:integer+

     Returns the two-item sequence ($start, $end) for $node, where
       $end = $start + $length - 1 + xs:integer($skip_start).
     Callers pick the items with the global variables $OFFSET_START and
     $OFFSET_END, which are declared outside this function (presumably 1 and 2;
     confirm at their declaration).

     $node       - a tei:text, tei:body, tei:p, tei:s or tei:seg element. Any other
                   element with $skip_start = false() yields no $start value and
                   fails the xs:integer type check.
     $skip_start - when true(), $start is forced to 0, so the result is (0, $length).

     @corresp values are parsed as "...(X,A,B)..."; A is treated as a start and
     A + B as an end (presumably an NKJP string-range(id, start, length) pointer).

     The function produces no diagnostic messages. The former per-call trace and
     the "rejected" debug output have been removed. -->
<xsl:function name="f:calc_offsets" as="xs:integer+">
    <xsl:param name="node" as="element()"/>
    <xsl:param name="skip_start" as="xs:boolean"/>

    <xsl:variable name="start" as="xs:integer">
        <xsl:choose>

            <xsl:when test="$skip_start or $node/self::tei:text or $node/self::tei:body">
                <xsl:sequence select="0"/>
            </xsl:when>

            <!-- handle p -->
            <xsl:when test="$node/self::tei:p">
                <xsl:variable name="my_pos" as="xs:integer"
                    select="count($node/preceding-sibling::tei:p) + 1"/>
                <xsl:variable name="preceding" as="node()*"
                    select="$node/ancestor::tei:body/tei:p[position() lt $my_pos]"/>

                <xsl:choose>
                    <xsl:when test="count($preceding) eq 0">
                        <xsl:sequence select="0"/>
                    </xsl:when>
                    <xsl:otherwise>
                        <!-- The second argument of fn:sum is only the value returned for an
                             EMPTY input; it is not added to a non-empty one. As long as
                             [$OFFSET_END] selects an item, this is that item unchanged.

                             Open question from the original author: should 1 be added after
                             each p? If that gets handled in $length instead (made sensitive
                             to $skip_start), the ",1" here has to go.

                             NOTE(review): the recursive call passes true(), so it returns
                             (0, $length) of the immediately preceding p only; earlier
                             paragraphs are not accumulated. Confirm whether that is intended.
                             A non-recursive variant that summed over all preceding p's used
                             to be kept here as commented-out, unmaintained code. -->
                        <xsl:sequence
                            select="sum(f:calc_offsets($preceding[last()],true())[$OFFSET_END],1)"/>
                    </xsl:otherwise>
                </xsl:choose>
            </xsl:when>

            <!-- handle s: the offset inside the current p (taken from the preceding s)
                 plus the start offset of that p -->
            <xsl:when test="$node/self::tei:s">
                <xsl:variable name="internal_start" as="xs:integer">
                    <xsl:variable name="my_pos" as="xs:integer"
                        select="count($node/preceding-sibling::tei:s) + 1"/>
                    <xsl:variable name="preceding" as="node()*"
                        select="$node/ancestor::tei:p[1]/tei:s[position() lt $my_pos]"/>

                    <xsl:choose>
                        <xsl:when test="count($preceding) eq 0">
                            <xsl:sequence select="0"/>
                        </xsl:when>
                        <xsl:otherwise>
                            <!-- no ",1" default here; the author's caveat about a possible
                                 +1 applies to this spot as well -->
                            <xsl:sequence
                                select="f:calc_offsets($preceding[last()],true())[$OFFSET_END]"/>
                        </xsl:otherwise>
                    </xsl:choose>
                </xsl:variable>

                <xsl:variable name="external_start" as="xs:integer"
                    select="f:calc_offsets($node/ancestor::tei:p[1],false())[$OFFSET_START]"/>

                <xsl:sequence select="$internal_start + $external_start"/>
            </xsl:when>

            <!-- handle seg: the s elements are irrelevant, the local offset is
                 immediately available on the @corresp -->
            <xsl:when test="$node/self::tei:seg">
                <xsl:variable name="numbers"
                    select="substring-after(substring-before($node/@corresp, ')'), ',')"/>
                <xsl:variable name="internal_start" as="xs:integer"
                    select="xs:integer(substring-before($numbers, ','))"/>
                <xsl:variable name="external_start" as="xs:integer"
                    select="f:calc_offsets($node/ancestor::tei:p[1], false())[$OFFSET_START]"/>

                <xsl:sequence select="$internal_start + $external_start"/>
            </xsl:when>
        </xsl:choose>
    </xsl:variable>

    <!-- The pointer attribute(s) whose end positions make up $length:
           text / body - for every descendant p, the last non-rejected seg of its last s
           p           - the last non-rejected seg of its last s
           s           - its last non-rejected seg
           otherwise   - the node's own @corresp (rejected or not)
         "Rejected" means @nkjp:rejected = 'true'. An empty selection is a type error. -->
    <xsl:variable name="last_corresps" as="attribute(corresp)+">
        <xsl:choose>
            <xsl:when test="$node/self::tei:text or $node/self::tei:body">
                <xsl:sequence
                    select="$node/descendant::tei:p/descendant::tei:s[last()]/(descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'])[last()]/@corresp"/>
            </xsl:when>
            <xsl:when test="$node/self::tei:p">
                <xsl:sequence
                    select="$node/descendant::tei:s[last()]/(descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'])[last()]/@corresp"/>
            </xsl:when>
            <xsl:when test="$node/self::tei:s">
                <xsl:sequence
                    select="$node/descendant::tei:seg[count(@nkjp:rejected) eq 0 or @nkjp:rejected ne 'true'][last()]/attribute::corresp"/>
            </xsl:when>
            <xsl:otherwise>
                <xsl:sequence select="$node/@corresp"/>
            </xsl:otherwise>
        </xsl:choose>
    </xsl:variable>

    <!-- sum of (A + B) over the selected pointers; a single pointer for p, s and seg -->
    <xsl:variable name="length" as="xs:integer"
        select="
            sum(for $c in $last_corresps
                return
                    let $numbers := substring-after(substring-before($c, ')'), ',')
                    return
                        xs:integer(substring-before($numbers, ',')) + xs:integer(substring-after($numbers, ',')))"/>

    <!-- xs:integer(true()) is 1, so with $skip_start the end equals $length -->
    <xsl:sequence select="$start, $start + $length - 1 + xs:integer($skip_start)"/>
</xsl:function>
1061
1062
Akron9a8ee3e2022-01-31 13:51:49 +01001063</xsl:stylesheet>