<?xml version="1.0" encoding="UTF-8"?>
<grammar xmlns="http://relaxng.org/ns/structure/1.0"
  datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes" ns="http://ids-mannheim.de/ns/KorAP">
<!-- $Id$ -->
<documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">RNG schema for KorAP
  XML metadata</documentation>

<define name="non-document_top_content"
  xmlns:a="http://relaxng.org/ns/compatibility/annotations/1.0">
  <a:documentation>Supposed to appear in both the extracted foundries (all_metadata.xml) and
    their exported/consolidated versions.</a:documentation>

  <!-- Required: the ID of the foundry element. -->
  <attribute name="id">
    <a:documentation>On foundry elements, the value is made up of the document ID, an underscore,
      and the value of @nspref. The central foundry list has no document IDs, so there the value
      is fully redundant with respect to @nspref (oh well).</a:documentation>
    <data type="ID"/>
  </attribute>

  <!-- Optional: reference to the foundry this one builds on. -->
  <optional>
    <attribute name="dependsOn">
      <a:documentation>This attribute is supposed to match the @nspref attribute of a foundry
        that the foundry in question depends on (e.g., the mate foundry depends on the base
        foundry for tokenization, so it uses @dependsOn="#base"). This has to be taken into
        account when exporting: fragment IDs have to be turned into long (potentially relative)
        URIs.</a:documentation>
      <data type="anyURI"/>
    </attribute>
  </optional>

  <!-- Optional: pointer to the licensing information. -->
  <optional>
    <attribute name="restricted">
      <a:documentation>This attribute points at wherever the information on the licensing
        conditions is stored. For the time being, its actual occurrences will be given some fake
        URI values.</a:documentation>
      <data type="anyURI"/>
    </attribute>
  </optional>
</define>
<!-- non-document_top_content -->

<define name="foundry_atts" xmlns:a="http://relaxng.org/ns/compatibility/annotations/1.0">
  <!-- Naming attributes of a foundry element; both are required. -->
  <attribute name="name">
    <a:documentation>This is the name of the foundry, it can be longish but still has to obey the
      constraints on XML Names. @nspref is its (possibly) shorter version, used for all sorts of
      referential magic.</a:documentation>
    <data type="NCName"/>
  </attribute>
  <attribute name="nspref">
    <!-- The annotation precedes the data pattern, as on every other attribute of this schema. -->
    <a:documentation>This attribute defines the namespace prefix that serves to identify foundries
      (e.g. "tt4" for (some version of) TreeTagger). Note also that the value list should be
      open, because users will be able to add their own foundries and thus to define their own
      prefixes. It may also be expected to be shorter than foundry name. This attribute is used
      as part of the IDs for layers (so the 1st layer in the base foundry will be ID-ed as
      "base_l1", and in the opennlp foundry, the ID will be "onlp_l1"), and also as a reference
      anchor for the @dependsOn attribute (via the resolution of 'long' URIs or just fragIDs,
      file-internally).</a:documentation>
    <data type="NCName"/>
  </attribute>
</define>
<!-- foundry_atts -->

<define name="layer" xmlns:a="http://relaxng.org/ns/compatibility/annotations/1.0">
  <!-- One annotation layer of a foundry. It is located by exactly one of @file / @external,
       carries a required @id and @name plus an optional @contains, is characterised by exactly
       one of @segm / @info, and may hold one info child followed by any number of idx
       children. -->
  <element name="layer">
    <!-- Where the layer lives: a file of its own, or a layer outside this foundry. -->
    <choice>
      <attribute name="file">
        <data type="anyURI"/>
      </attribute>
      <attribute name="external">
        <a:documentation>This is to be used in consolidated foundry metadata, to signal that the
          layer in question (usually/always the tokenization layer) is external to the foundry.
          This attribute sadly becomes invalid if the foundry is exported, and thus needs a
          special mechanism in such cases (possibly, it should entail the export of the targeted
          layer from another (base?) foundry and then it should be replaced by the appropriate
          @file attribute)</a:documentation>
        <data type="IDREF"/>
      </attribute>
    </choice>
    <attribute name="id">
      <a:documentation>For layer elements, this consists of the foundry ID, underscore, and "l"
        followed by a number. It is obligatory because you never know whether it may be
        referenced from outside.</a:documentation>
      <data type="ID"/>
    </attribute>
    <optional>
      <attribute name="contains">
        <a:documentation>This attribute is a list of layers (in this very foundry) that this
          layer makes redundant; it is useful for KorAP-internal indexing strategies. Whether we
          should be able to reference layers in other foundries by URI is a matter that we leave
          for later (possibly something like @containsURI will help us then, to make it easier to
          validate these simple relationships)</a:documentation>
        <data type="IDREFS"/>
      </attribute>
    </optional>
    <attribute name="name">
      <a:documentation>This list should in fact be open. I.e., it is useful now to restrict it,
        to eliminate some mismatch bugs, but for production, these values should become
        suggestions, maybe except for "token". Note also that this value is used in constructing
        element IDs.</a:documentation>
      <!-- A value pattern cannot hold child elements, so each annotation below follows the
           value it describes. -->
      <choice>
        <value type="NCName">token</value>
        <a:documentation>tokenization; the presence of the element with this name should be
          forced in each foundry-layer metadata, but RNG on its own doesn't provide a clean way
          of encoding that</a:documentation>
        <value type="NCName">sent</value>
        <a:documentation>sentence segmentation</a:documentation>
        <value type="NCName">syntax</value>
        <a:documentation>gross syntactic structure</a:documentation>
        <value type="NCName">syntax-const</value>
        <a:documentation>syntax: constituent structure</a:documentation>
        <value type="NCName">syntax-dep</value>
        <a:documentation>syntax: dependency relations</a:documentation>
        <value type="NCName">morph</value>
        <a:documentation>morphosyntactic information</a:documentation>
        <value type="NCName">phrase</value>
        <a:documentation>phrasal segmentation</a:documentation>
        <value type="NCName">para</value>
        <a:documentation>paragraph segmentation</a:documentation>
        <value type="NCName">aggr</value>
        <a:documentation>'greedy' (more precisely: aggressive) tokenization</a:documentation>
        <value type="NCName">cons</value>
        <a:documentation>conservative ('greedy' in the regex sense)
          tokenization</a:documentation>
        <value type="NCName">struct</value>
        <a:documentation>structural divisions in the text, highlighting info,
          etc.</a:documentation>
        <value type="NCName">ne</value>
        <a:documentation>named entities</a:documentation>
        <value type="NCName">ne_dewac</value>
        <a:documentation>named entities, dewac model for the Stanford NER</a:documentation>
        <value type="NCName">ne_hgc</value>
        <a:documentation>named entities, hgc model for the Stanford NER</a:documentation>
      </choice>
    </attribute>
    <!-- A layer declares either its segmentation granularity or the kinds of information it
         carries, never both. -->
    <choice>
      <attribute name="segm">
        <a:documentation>granularity of segmentation; possibly, this should allow a list of
          values, to be fully flexible</a:documentation>
        <choice>
          <value type="NCName">para</value>
          <value type="NCName">s</value>
          <value type="NCName">chunk</value>
          <a:documentation>The 'chunk' value is meant to be a catch-all, if a more precise value
            can't be determined</a:documentation>
          <value type="NCName">tok</value>
        </choice>
      </attribute>
      <attribute name="info">
        <a:documentation>kind of information expressed by the given layer of annotation (there
          may, and often will, be more than one)</a:documentation>
        <!-- Whitespace-separated list holding one or more of the tokens below. -->
        <list>
          <oneOrMore>
            <choice>
              <value type="NCName">pos</value>
              <value type="NCName">lemma</value>
              <value type="NCName">msd</value>
              <a:documentation>'msd' is the traditional abbreviation for "morphosyntactic
                description", listing info on e.g. tense, person, case, etc.</a:documentation>
              <value type="NCName">dep</value>
              <a:documentation>'dep' is information about types of relations, used in
                dependency-style annotations; it is an indication for the visualiser that
                word-to-word relationships should be displayed</a:documentation>
              <value type="NCName">lbl</value>
              <a:documentation>'lbl' indicates the presence of labels over dependency
                relations</a:documentation>
              <value type="NCName">const</value>
              <a:documentation>'const' stands for 'constituency' or hierarchical, tree-based
                annotations; it is an indication for the visualiser that it should display
                syntactic trees</a:documentation>
              <value type="NCName">cat</value>
              <a:documentation>'cat' is used for syntactic categories, as separate from pos; note
                that these sets need not be disjoint (at the lexical level, they usually
                overlap), but the frontend prefers to keep them separate. 'cat' will be found in
                the context of chunking or hierarchical parsing and will characterise nodes; it
                may also be found in dependency annotations, to indicate labels on nodes, as
                opposed to labels on arcs (the latter are signalled by 'lbl')</a:documentation>
              <value type="NCName">struct</value>
              <a:documentation>all non-linguistic information (headers, highlights,
                etc.)</a:documentation>
              <value type="NCName">frag</value>
              <a:documentation>non-exhaustive coverage (when
                spanList/@fragmented="true")</a:documentation>
              <value type="NCName">ne</value>
              <a:documentation>named entities</a:documentation>
            </choice>
          </oneOrMore>
        </list>
      </attribute>
    </choice>
    <optional>
      <ref name="info"/>
    </optional>
    <zeroOrMore>
      <element name="idx">
        <attribute name="name">
          <a:documentation>name of the element to match; if this element lacks the attribute
            @handle, @name is used as the handle for the index</a:documentation>
          <data type="string"/>
        </attribute>
        <optional>
          <attribute name="ns">
            <a:documentation>namespace (in DeReKo set to null, hence absent)</a:documentation>
            <data type="anyURI"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="key">
            <!-- NOTE(review): the description below is a verbatim copy of the one on @ns and
                 does not describe @key. The @name description above mentions a @handle
                 attribute that this schema does not declare; possibly @key is that handle.
                 Confirm the intended meaning and rewrite the text. -->
            <a:documentation>namespace (in DeReKo set to null, hence absent)</a:documentation>
            <data type="NCName"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="extra">
            <a:documentation>extra features to be turned into extra keys in the index; e.g., the
              @name may be "hi" and the @extra can be "rend", which causes an extra key, e.g.
              "rend:bold" to be associated with this span, and its payload is set to 'element:hi'
              to make sure about its origin; this attribute lists attribute
              names...</a:documentation>
            <!-- Any string is accepted; a text pattern needs no oneOrMore wrapper to do that.
                 NOTE(review): if the value is meant to be a whitespace-separated list of
                 attribute names, a list of NCName data would validate it more tightly. -->
            <text/>
          </attribute>
        </optional>
        <optional>
          <!-- "fs" is not defined in this file; presumably it comes from the included
               fsr.rng (to be confirmed). -->
          <ref name="fs"/>
        </optional>
      </element>
    </zeroOrMore>
  </element>
</define>
<!-- layer -->

<define name="common_top_content" xmlns:a="http://relaxng.org/ns/compatibility/annotations/1.0">
  <a:documentation>For @type="foundry" and @type="document". In the centralized foundry list,
    these values would be invalid.</a:documentation>
  <attribute name="docid">
    <a:documentation>This attribute is crucial for document-level metadata, and should be the
      same across the header, the text, and the metadata files.</a:documentation>
    <data type="NCName"/>
  </attribute>
  <optional>
    <attribute name="masked">
      <!-- The annotation sits inside the attribute it describes, ahead of the data pattern,
           like every other attribute annotation in this schema. -->
      <a:documentation>This attribute should only apply to document-level foundries, I think,
        unless we use it more generally, to mark withdrawn foundries (?)</a:documentation>
      <data type="boolean"/>
    </attribute>
  </optional>
  <!-- Exactly one doc element, carrying the file reference of the text. -->
  <element name="doc">
    <a:documentation>This element makes it possible to create the file path to the raw text
      file, and the xpath to the appropriate element.</a:documentation>
    <attribute name="file">
      <data type="normalizedString"/>
    </attribute>
  </element>
  <!-- Any number of binary elements, each with its own ID and the URI of a file. -->
  <zeroOrMore>
    <element name="binary">
      <attribute name="id">
        <data type="ID"/>
      </attribute>
      <attribute name="file">
        <data type="anyURI"/>
      </attribute>
    </element>
  </zeroOrMore>
</define>
<!-- common_top_content -->

<define name="info" xmlns:a="http://relaxng.org/ns/compatibility/annotations/1.0">
  <!-- An info element holds either free text or a single tool description, optionally followed
       by a remark. -->
  <element name="info">
    <choice>
      <!-- Variant 1: plain text. -->
      <text/>
      <!-- Variant 2: a structured description of the tool that produced the data. -->
      <element name="tool">
        <attribute name="name">
          <data type="string"/>
        </attribute>
        <optional>
          <attribute name="uri">
            <a:documentation>sometimes a URI may be useful to identify the
              tool</a:documentation>
            <data type="anyURI"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="ver">
            <a:documentation>version information</a:documentation>
            <data type="string"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="date">
            <a:documentation>date it was used (can be provided in the
              changelog)</a:documentation>
            <data type="date"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="model">
            <a:documentation>Model used to derive the output</a:documentation>
            <data type="string"/>
          </attribute>
        </optional>
        <!-- Optional changelog: one or more dated change entries with free text. -->
        <optional>
          <element name="changelog">
            <oneOrMore>
              <element name="change">
                <attribute name="date">
                  <data type="date"/>
                </attribute>
                <text/>
              </element>
            </oneOrMore>
          </element>
        </optional>
        <optional>
          <element name="rem">
            <a:documentation>for loose remarks</a:documentation>
            <text/>
          </element>
        </optional>
      </element>
    </choice>
    <!-- A remark may also be attached to the info element itself. -->
    <optional>
      <element name="rem">
        <a:documentation>for loose remarks</a:documentation>
        <text/>
      </element>
    </optional>
  </element>
</define>
<!-- info -->

<start>
  <!-- The root element is metadata; its @type value selects one of three content models. -->
  <element name="metadata">
    <choice>

      <!-- type="document": the common top content, then one or more foundry elements that
           carry the naming attributes and a @path. -->
      <group>
        <attribute name="type">
          <value type="NCName">document</value>
        </attribute>
        <ref name="common_top_content"/>
        <oneOrMore>
          <element name="foundry">
            <ref name="foundry_atts"/>
            <attribute name="path">
              <data type="normalizedString"/>
            </attribute>
          </element>
        </oneOrMore>
      </group>

      <!-- type="foundry": the common top content, then one or more fully described foundry
           elements (optional info, one or more layers). -->
      <group>
        <attribute name="type">
          <value type="NCName">foundry</value>
        </attribute>
        <ref name="common_top_content"/>
        <oneOrMore>
          <element name="foundry">
            <ref name="non-document_top_content"/>
            <ref name="foundry_atts"/>
            <optional>
              <ref name="info"/>
            </optional>
            <oneOrMore>
              <ref name="layer"/>
            </oneOrMore>
          </element>
        </oneOrMore>
      </group>

      <!-- type="central": the same fully described foundry elements, without the common top
           content. -->
      <group>
        <attribute name="type">
          <value type="NCName">central</value>
        </attribute>
        <oneOrMore>
          <element name="foundry">
            <ref name="non-document_top_content"/>
            <ref name="foundry_atts"/>
            <optional>
              <ref name="info"/>
            </optional>
            <oneOrMore>
              <ref name="layer"/>
            </oneOrMore>
          </element>
        </oneOrMore>
      </group>

      <!--<group>
        <value type="NCName">speech</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">for lack of a
          better word... the idea being that this is a piece of metadata that encodes some
          information concerning the binary stream that is decomposed "downstairs" into
          individual speaker transcription lines</documentation>
      </group>-->

    </choice>
  </element>
</start>
<include href="fsr.rng"/>
</grammar>