<?xml version="1.0" encoding="UTF-8"?>
2<grammar xmlns="http://relaxng.org/ns/structure/1.0"
3 datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes" ns="http://ids-mannheim.de/ns/KorAP">
4 <!-- $Id$ -->
5 <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">RNG schema for KorAP
6 XML metadata</documentation>
7
<define name="non-document_top_content"
        xmlns:a="http://relaxng.org/ns/compatibility/annotations/1.0">
  <a:documentation>Supposed to appear both in the extracted foundries (all_metadata.xml) and
    in their exported/consolidated versions.</a:documentation>

  <!-- Mandatory identifier. -->
  <attribute name="id">
    <a:documentation>For foundry elements, this consists of the document ID, an underscore,
      and @nspref. In the central foundry list there are no docIDs, so the value is fully
      redundant with respect to @nspref (oh well).</a:documentation>
    <data type="ID"/>
  </attribute>

  <!-- Optional pointer to the foundry this one builds upon. -->
  <optional>
    <attribute name="dependsOn">
      <a:documentation>This attribute is supposed to match the @nspref attribute of a foundry
        that the foundry in question depends on (e.g., the mate foundry depends on the base
        foundry for tokenization, so it uses @dependsOn="#base"). This has to be taken into
        account when exporting: fragIDs have to be turned into long (potentially relative)
        URIs.</a:documentation>
      <data type="anyURI"/>
    </attribute>
  </optional>

  <!-- Optional pointer to licensing information. -->
  <optional>
    <attribute name="restricted">
      <a:documentation>This attribute points at wherever the information on the licensing
        conditions is stored. For the time being, its actual occurrences will be given some
        fake URI values.</a:documentation>
      <data type="anyURI"/>
    </attribute>
  </optional>
</define>
36 <!-- non-document_top_content -->
37
<define name="foundry_atts"
        xmlns:a="http://relaxng.org/ns/compatibility/annotations/1.0">
  <a:documentation>Naming attributes of a foundry element: the full @name and the short
    @nspref. Both are mandatory and both must be NCNames.</a:documentation>

  <attribute name="name">
    <a:documentation>This is the name of the foundry; it can be longish but still has to obey
      the constraints on XML Names. @nspref is its (possibly) shorter version, used for all
      sorts of referential magic.</a:documentation>
    <data type="NCName"/>
  </attribute>

  <attribute name="nspref">
    <!-- Documentation comes first, before the data pattern, as in every other attribute
         definition of this schema. -->
    <a:documentation>This attribute defines the namespace prefix that serves to identify
      foundries (e.g. "tt4" for (some version of) TreeTagger). Note also that the value list
      should be open, because users will be able to add their own foundries and thus to
      define their own prefixes. It may also be expected to be shorter than the foundry name.
      This attribute is used as part of the IDs for layers (so the 1st layer in the base
      foundry will be ID-ed as "base_l1", and in the opennlp foundry, the ID will be
      "onlp_l1"), and also as a reference anchor for the @dependsOn attribute (via the
      resolution of 'long' URIs or just fragIDs, file-internally).</a:documentation>
    <data type="NCName"/>
  </attribute>
</define>
57 <!-- foundry_atts -->
58
<define name="layer"
        xmlns:a="http://relaxng.org/ns/compatibility/annotations/1.0">
  <a:documentation>A single annotation layer of a foundry. A layer is located either by @file
    or by @external, carries a mandatory @id and @name, is characterised by either @segm or
    @info, and may contain an info element followed by any number of idx
    elements.</a:documentation>
  <element name="layer">

    <!-- Location of the layer: exactly one of @file / @external. -->
    <choice>
      <attribute name="file">
        <data type="anyURI"/>
      </attribute>
      <attribute name="external">
        <a:documentation>This is to be used in consolidated foundry metadata, to signal that
          the layer in question (usually/always the tokenization layer) is external to the
          foundry. This attribute sadly becomes invalid if the foundry is exported, and thus
          needs a special mechanism in such cases (possibly, it should entail the export of
          the targeted layer from another (base?) foundry and then it should be replaced by
          the appropriate @file attribute).</a:documentation>
        <data type="IDREF"/>
      </attribute>
    </choice>

    <attribute name="id">
      <a:documentation>For layer elements, this consists of the foundry ID, underscore, and
        "l" followed by a number. It is obligatory because you never know whether it may be
        referenced from outside.</a:documentation>
      <data type="ID"/>
    </attribute>

    <optional>
      <attribute name="contains">
        <a:documentation>This attribute is a list of layers (in this very foundry) that this
          layer makes redundant; it is useful for KorAP-internal indexing strategies. Whether
          we should be able to reference layers in other foundries by URI is a matter that we
          leave for later (possibly something like @containsURI will help us then, to make it
          easier to validate these simple relationships).</a:documentation>
        <data type="IDREFS"/>
      </attribute>
    </optional>

    <attribute name="name">
      <a:documentation>This list should in fact be open. I.e., it is useful now to restrict
        it, to eliminate some mismatch bugs, but for production, these values should become
        suggestions, maybe except for "token". Note also that this value is used in
        constructing element IDs.</a:documentation>
      <!-- Each a:documentation below describes the value element that precedes it. -->
      <choice>
        <value type="NCName">token</value>
        <a:documentation>tokenization; the presence of the element with this name should be
          forced in each foundry-layer metadata, but RNG on its own doesn't provide a clean
          way of encoding that</a:documentation>
        <value type="NCName">sent</value>
        <a:documentation>sentence segmentation</a:documentation>
        <value type="NCName">syntax</value>
        <a:documentation>gross syntactic structure</a:documentation>
        <value type="NCName">syntax-const</value>
        <a:documentation>syntax: constituent structure</a:documentation>
        <value type="NCName">syntax-dep</value>
        <a:documentation>syntax: dependency relations</a:documentation>
        <value type="NCName">morph</value>
        <a:documentation>morphosyntactic information</a:documentation>
        <value type="NCName">phrase</value>
        <a:documentation>phrasal segmentation</a:documentation>
        <value type="NCName">para</value>
        <a:documentation>paragraph segmentation</a:documentation>
        <value type="NCName">aggr</value>
        <a:documentation>'greedy' (more precisely: aggressive) tokenization</a:documentation>
        <value type="NCName">cons</value>
        <a:documentation>conservative ('greedy' in the regex sense)
          tokenization</a:documentation>
        <value type="NCName">struct</value>
        <a:documentation>structural divisions in the text, highlighting info,
          etc.</a:documentation>
        <value type="NCName">ne</value>
        <a:documentation>named entities</a:documentation>
        <value type="NCName">ne_dewac</value>
        <a:documentation>named entities, dewac model for the Stanford NER</a:documentation>
        <value type="NCName">ne_hgc</value>
        <a:documentation>named entities, hgc model for the Stanford NER</a:documentation>
      </choice>
    </attribute>

    <!-- Characterisation of the layer: exactly one of @segm / @info. -->
    <choice>
      <attribute name="segm">
        <a:documentation>granularity of segmentation; possibly, this should allow a list of
          values, to be fully flexible</a:documentation>
        <choice>
          <value type="NCName">para</value>
          <value type="NCName">s</value>
          <value type="NCName">chunk</value>
          <a:documentation>The 'chunk' value is meant to be a catch-all, if a more precise
            value can't be determined</a:documentation>
          <value type="NCName">tok</value>
        </choice>
      </attribute>
      <attribute name="info">
        <a:documentation>kind of information expressed by the given layer of annotation
          (there may, and often will, be more than one)</a:documentation>
        <!-- Whitespace-separated list of one or more of the tokens below. -->
        <list>
          <oneOrMore>
            <choice>
              <value type="NCName">pos</value>
              <value type="NCName">lemma</value>
              <value type="NCName">msd</value>
              <a:documentation>'msd' is the traditional abbreviation for "morphosyntactic
                description", listing info on e.g. tense, person, case,
                etc.</a:documentation>
              <value type="NCName">dep</value>
              <a:documentation>'dep' is information about types of relations, used in
                dependency-style annotations; it is an indication for the visualiser that
                word-to-word relationships should be displayed</a:documentation>
              <value type="NCName">lbl</value>
              <a:documentation>'lbl' indicates the presence of labels over dependency
                relations</a:documentation>
              <value type="NCName">const</value>
              <a:documentation>'const' stands for 'constituency' or hierarchical, tree-based
                annotations; it is an indication for the visualiser that it should display
                syntactic trees</a:documentation>
              <value type="NCName">cat</value>
              <a:documentation>'cat' is used for syntactic categories, as separate from pos;
                note that these sets need not be disjoint (at the lexical level, they usually
                overlap), but the frontend prefers to keep them separate. 'cat' will be found
                in the context of chunking or hierarchical parsing and will characterise
                nodes; it may also be found in dependency annotations, to indicate labels on
                nodes, as opposed to labels on arcs (the latter are signalled by
                'lbl')</a:documentation>
              <value type="NCName">struct</value>
              <a:documentation>all non-linguistic information (headers, highlights,
                etc.)</a:documentation>
              <value type="NCName">frag</value>
              <a:documentation>non-exhaustive coverage (when
                spanList/@fragmented="true")</a:documentation>
              <value type="NCName">ne</value>
              <a:documentation>named entities</a:documentation>
            </choice>
          </oneOrMore>
        </list>
      </attribute>
    </choice>

    <optional>
      <ref name="info"/>
    </optional>

    <zeroOrMore>
      <element name="idx">
        <attribute name="name">
          <a:documentation>name of the element to match; if this element lacks the attribute
            @handle, @name is used as the handle for the index</a:documentation>
          <data type="string"/>
        </attribute>
        <optional>
          <attribute name="ns">
            <a:documentation>namespace (in DeReKo set to null, hence
              absent)</a:documentation>
            <data type="anyURI"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="key">
            <!-- NOTE(review): this attribute previously carried a verbatim copy of the @ns
                 documentation ("namespace ..."), which cannot be right for an NCName-typed
                 attribute. The intended meaning of @key is not stated anywhere in this
                 schema. TODO: have the schema author supply the real description. -->
            <a:documentation>optional key, restricted to an NCName</a:documentation>
            <data type="NCName"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="extra">
            <a:documentation>extra features to be turned into extra keys in the index; e.g.,
              the @name may be "hi" and the @extra can be "rend", which causes an extra key,
              e.g. "rend:bold" to be associated with this span, and its payload is set to
              'element:hi' to make sure about its origin; this attribute lists attribute
              names...</a:documentation>
            <!-- A bare text pattern accepts exactly the same values as the former
                 oneOrMore-wrapped text pattern: any string. -->
            <text/>
          </attribute>
        </optional>
        <optional>
          <!-- "fs" is not defined in this file; presumably it comes from the included
               fsr.rng (TODO confirm). -->
          <ref name="fs"/>
        </optional>
      </element>
    </zeroOrMore>
  </element>
</define>
245 <!-- layer -->
246
<define name="common_top_content"
        xmlns:a="http://relaxng.org/ns/compatibility/annotations/1.0">
  <a:documentation>For @type="foundry" and @type="document". In the centralized foundry list,
    these values would be invalid.</a:documentation>

  <attribute name="docid">
    <a:documentation>This attribute is crucial for document-level metadata, and should be the
      same across the header, the text, and the metadata files.</a:documentation>
    <data type="NCName"/>
  </attribute>

  <optional>
    <attribute name="masked">
      <!-- The documentation lives inside the attribute it describes (it used to trail the
           attribute as a stray sibling inside the optional wrapper). -->
      <a:documentation>This attribute should only apply to document-level foundries, I think,
        unless we use it more generally, to mark withdrawn foundries (?)</a:documentation>
      <data type="boolean"/>
    </attribute>
  </optional>

  <element name="doc">
    <a:documentation>This element makes it possible to create the file path to the raw text
      file, and the xpath to the appropriate element.</a:documentation>
    <attribute name="file">
      <data type="normalizedString"/>
    </attribute>
  </element>

  <!-- Any number of binary elements, each with a mandatory @id and @file. -->
  <zeroOrMore>
    <element name="binary">
      <attribute name="id">
        <data type="ID"/>
      </attribute>
      <attribute name="file">
        <data type="anyURI"/>
      </attribute>
    </element>
  </zeroOrMore>
</define>
284 <!-- common_top_content -->
285
<define name="info"
        xmlns:a="http://relaxng.org/ns/compatibility/annotations/1.0">
  <!-- An info element holds either free text or a single tool description, optionally
       followed by a rem element. -->
  <element name="info">
    <choice>
      <text/>
      <element name="tool">
        <!-- @name is the only mandatory attribute of tool. -->
        <attribute name="name">
          <data type="string"/>
        </attribute>
        <optional>
          <attribute name="uri">
            <a:documentation>sometimes a URI may be useful to identify the
              tool</a:documentation>
            <data type="anyURI"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="ver">
            <a:documentation>version information</a:documentation>
            <data type="string"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="date">
            <a:documentation>date it was used (can be provided in the
              changelog)</a:documentation>
            <data type="date"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="model">
            <a:documentation>Model used to derive the output</a:documentation>
            <data type="string"/>
          </attribute>
        </optional>

        <!-- Optional changelog: one or more dated change entries with free text. -->
        <optional>
          <element name="changelog">
            <oneOrMore>
              <element name="change">
                <attribute name="date">
                  <data type="date"/>
                </attribute>
                <text/>
              </element>
            </oneOrMore>
          </element>
        </optional>

        <optional>
          <element name="rem">
            <a:documentation>for loose remarks</a:documentation>
            <text/>
          </element>
        </optional>
      </element>
    </choice>

    <!-- A rem is also allowed directly under info, after the text or the tool. -->
    <optional>
      <element name="rem">
        <a:documentation>for loose remarks</a:documentation>
        <text/>
      </element>
    </optional>
  </element>
</define>
354 <!-- info -->
355
<start>
  <!-- Root element: metadata. Its @type attribute ("document", "foundry" or "central")
       selects the content model. -->
  <element name="metadata">
    <choice>

      <!-- type="document": common top content, then one or more foundry elements that carry
           only the naming attributes and a @path. -->
      <group>
        <attribute name="type">
          <value type="NCName">document</value>
        </attribute>
        <ref name="common_top_content"/>
        <oneOrMore>
          <element name="foundry">
            <ref name="foundry_atts"/>
            <attribute name="path">
              <data type="normalizedString"/>
            </attribute>
          </element>
        </oneOrMore>
      </group>

      <!-- type="foundry" and type="central" share one and the same description of their
           foundry children; they differ only in that type="foundry" additionally requires
           the common top content. -->
      <group>
        <choice>
          <group>
            <attribute name="type">
              <value type="NCName">foundry</value>
            </attribute>
            <ref name="common_top_content"/>
          </group>
          <attribute name="type">
            <value type="NCName">central</value>
          </attribute>
        </choice>
        <oneOrMore>
          <element name="foundry">
            <ref name="non-document_top_content"/>
            <ref name="foundry_atts"/>
            <optional>
              <ref name="info"/>
            </optional>
            <oneOrMore>
              <ref name="layer"/>
            </oneOrMore>
          </element>
        </oneOrMore>
      </group>

      <!-- Disabled placeholder for a further metadata type, kept from the original:
      <group>
        <value type="NCName">speech</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">for lack of a
          better word... the idea being that this is a piece of metadata that encodes some
          information concerning the binary stream that is decomposed "downstairs" into
          individual speaker transcription lines</documentation>
      </group>
      -->

    </choice>
  </element>
</start>
422 <include href="fsr.rng"/>
423</grammar>