| <?xml version="1.0" encoding="UTF-8"?> |
| <grammar xmlns="http://relaxng.org/ns/structure/1.0" |
| datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes" ns="http://ids-mannheim.de/ns/KorAP"> |
| <!-- $Id$ --> |
| <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">RNG schema for KorAP |
| XML metadata</documentation> |
| |
| <define name="non-document_top_content"> |
|   <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">Supposed to appear in |
|     both the extracted foundries (all_metadata.xml) and their exported/consolidated |
|     versions</documentation> |
|   <!-- Required identifier, typed as an XML ID. --> |
|   <attribute name="id"> |
|     <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">For foundry |
|       elements, this consists of the document ID, underscore, and @nspref. In the central foundry |
|       list, there are no docIDs, so it is fully redundant wrt @nspref (oh well).</documentation> |
|     <data type="ID"/> |
|   </attribute> |
|   <!-- Optional URI reference to the foundry this one builds upon. --> |
|   <optional> |
|     <attribute name="dependsOn"> |
|       <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute is |
|         supposed to match the @nspref attribute of a foundry that the foundry in question depends on |
|         (e.g., the mate foundry depends on the base foundry for tokenization, so uses |
|         @dependsOn="#base"). This has to be taken into account when exporting: fragIDs have to be |
|         turned into long (potentially relative) URIs.</documentation> |
|       <data type="anyURI"/> |
|     </attribute> |
|   </optional> |
|   <!-- Optional URI pointing at the licensing information. --> |
|   <optional> |
|     <attribute name="restricted"> |
|       <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute |
|         points at wherever the information on the licensing conditions is stored. So, for the time |
|         being, its actual occurrences will be given some fake URI values.</documentation> |
|       <data type="anyURI"/> |
|     </attribute> |
|   </optional> |
| </define> |
| <!-- non-document_top_content --> |
| |
| <define name="foundry_atts"> |
|   <!-- The two naming attributes of a foundry; both are required and typed as NCName. --> |
|   <attribute name="name"> |
|     <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This is the name of |
|       the foundry, it can be longish but still has to obey the constraints on XML Names. @nspref |
|       is its (possibly) shorter version, used for all sorts of referential magic.</documentation> |
|     <data type="NCName"/> |
|   </attribute> |
|   <attribute name="nspref"> |
|     <!-- The annotation precedes the pattern it documents, as for every other attribute here. --> |
|     <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute |
|       defines the namespace prefix that serves to identify foundries (e.g. "tt4" for (some version |
|       of) TreeTagger). Note also that the value list should be open, because users will be able to |
|       add their own foundries and thus to define their own prefixes. It may also be expected to be |
|       shorter than foundry name. This attribute is used as part of the IDs for layers (so the 1st |
|       layer in the base foundry will be ID-ed as "base_l1", and in the opennlp foundry, the ID |
|       will be "onlp_l1"), and also as a reference anchor for the @dependsOn |
|       attribute (via the resolution of 'long' URIs or just fragIDs, file-internally).</documentation> |
|     <data type="NCName"/> |
|   </attribute> |
| </define> |
| <!-- foundry_atts --> |
| |
| <define name="layer"> |
|   <!-- One annotation layer of a foundry: where it lives (@file or @external), its @id and @name, |
|        either its segmentation granularity (@segm) or the kinds of information it carries (@info), |
|        an optional info element, and any number of idx index descriptions. --> |
|   <element name="layer"> |
|     <!-- Exactly one of @file / @external locates the layer. --> |
|     <choice> |
|       <attribute name="file"> |
|         <data type="anyURI"/> |
|       </attribute> |
|       <attribute name="external"> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This is to be |
|           used in consolidated foundry metadata, to signal that the layer in question |
|           (usually/always the tokenization layer) is external to the foundry. This attribute sadly |
|           becomes invalid if the foundry is exported, and thus needs a special mechanism in such |
|           cases (possibly, it should entail the export of the targeted layer from another (base?) |
|           foundry and then it should be replaced by the appropriate @file |
|           attribute)</documentation> |
|         <data type="IDREF"/> |
|       </attribute> |
|     </choice> |
|     <attribute name="id"> |
|       <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">For layer |
|         elements, this consists of the foundry ID, underscore, and "l" followed by a number. It is |
|         obligatory because you never know whether it may be referenced from |
|         outside.</documentation> |
|       <data type="ID"/> |
|     </attribute> |
|     <optional> |
|       <attribute name="contains"> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute |
|           is a list of layers (in this very foundry) that this layer makes redundant; it is useful |
|           for KorAP-internal indexing strategies. Whether we should be able to reference layers in |
|           other foundries by URI is a matter that we leave for later (possibly something like |
|           @containsURI will help us then, to make it easier to validate these simple |
|           relationships)</documentation> |
|         <data type="IDREFS"/> |
|       </attribute> |
|     </optional> |
|     <!-- Closed list of layer names; each value is followed by the annotation describing it. --> |
|     <attribute name="name"> |
|       <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This list should |
|         in fact be open. I.e., it is useful now to restrict it, to eliminate some mismatch bugs, |
|         but for production, these values should become suggestions, maybe except for |
|         "token". Note also that this value is used in constructing element IDs.</documentation> |
|       <choice> |
|         <value type="NCName">token</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">tokenization; |
|           the presence of the element with this name should be forced in each foundry-layer |
|           metadata, but RNG on its own doesn't provide a clean way of encoding |
|           that</documentation> |
|         <value type="NCName">sent</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">sentence |
|           segmentation</documentation> |
|         <value type="NCName">syntax</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">gross syntactic |
|           structure</documentation> |
|         <value type="NCName">syntax-const</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">syntax: |
|           constituent structure</documentation> |
|         <value type="NCName">syntax-dep</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">syntax: |
|           dependency relations</documentation> |
|         <value type="NCName">morph</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">morphosyntactic |
|           information</documentation> |
|         <value type="NCName">phrase</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">phrasal |
|           segmentation</documentation> |
|         <value type="NCName">para</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">paragraph |
|           segmentation</documentation> |
|         <value type="NCName">aggr</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'greedy' (more |
|           precisely: aggressive) tokenization</documentation> |
|         <value type="NCName">cons</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">conservative |
|           ('greedy' in the regex sense) tokenization</documentation> |
|         <value type="NCName">struct</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">structural |
|           divisions in the text, highlighting info, etc.</documentation> |
|         <value type="NCName">ne</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named |
|           entities</documentation> |
|         <value type="NCName">ne_dewac</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named |
|           entities, dewac model for the Stanford NER</documentation> |
|         <value type="NCName">ne_hgc</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named |
|           entities, hgc model for the Stanford NER</documentation> |
|       </choice> |
|     </attribute> |
|     <!-- A layer carries either @segm (single value) or @info (whitespace-separated list). --> |
|     <choice> |
|       <attribute name="segm"> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">granularity of |
|           segmentation; possibly, this should allow a list of values, to be fully |
|           flexible</documentation> |
|         <choice> |
|           <value type="NCName">para</value> |
|           <value type="NCName">s</value> |
|           <value type="NCName">chunk</value> |
|           <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">The 'chunk' |
|             value is meant to be a catch-all, if a more precise value can't be |
|             determined</documentation> |
|           <value type="NCName">tok</value> |
|         </choice> |
|       </attribute> |
|       <attribute name="info"> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">kind of |
|           information expressed by the given layer of annotation (there may, and often will, be |
|           more than one)</documentation> |
|         <list> |
|           <oneOrMore> |
|             <choice> |
|               <value type="NCName">pos</value> |
|               <value type="NCName">lemma</value> |
|               <value type="NCName">msd</value> |
|               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'msd' is |
|                 the traditional abbreviation for "morphosyntactic description", listing info on |
|                 e.g. tense, person, case, etc.</documentation> |
|               <value type="NCName">dep</value> |
|               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'dep' is |
|                 information about types of relations, used in dependency-style annotations; it is |
|                 an indication for the visualiser that word-to-word relationships should be |
|                 displayed</documentation> |
|               <value type="NCName">lbl</value> |
|               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'lbl' |
|                 indicates the presence of labels over dependency relations</documentation> |
|               <value type="NCName">const</value> |
|               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'const' |
|                 stands for 'constituency' or hierarchical, tree-based annotations; it is an |
|                 indication for the visualiser that it should display syntactic |
|                 trees</documentation> |
|               <value type="NCName">cat</value> |
|               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'cat' is |
|                 used for syntactic categories, as separate from pos; note that these sets need not |
|                 be disjoint (at the lexical level, they usually overlap), but the frontend prefers |
|                 to keep them separate. 'cat' will be found in the context of chunking or |
|                 hierarchical parsing and will characterise nodes; it may also be found in |
|                 dependency annotations, to indicate labels on nodes, as opposed to labels on arcs |
|                 (the latter are signalled by 'lbl')</documentation> |
|               <value type="NCName">struct</value> |
|               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">all |
|                 non-linguistic information (headers, highlights, etc.)</documentation> |
|               <value type="NCName">frag</value> |
|               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">non-exhaustive |
|                 coverage (when spanList/@fragmented="true")</documentation> |
|               <value type="NCName">ne</value> |
|               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named |
|                 entities</documentation> |
|             </choice> |
|           </oneOrMore> |
|         </list> |
|       </attribute> |
|     </choice> |
|     <optional> |
|       <ref name="info"/> |
|     </optional> |
|     <!-- Index descriptions: which elements of the layer are to be indexed, and how. --> |
|     <zeroOrMore> |
|       <element name="idx"> |
|         <attribute name="name"> |
|           <!-- NOTE(review): no @handle attribute is declared for idx in this schema; the text |
|                below presumably refers to @key. Confirm and align the wording. --> |
|           <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">name of the |
|             element to match; if this element lacks the attribute @handle, @name is used as the |
|             handle for the index</documentation> |
|           <data type="string"/> |
|         </attribute> |
|         <optional> |
|           <attribute name="ns"> |
|             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">namespace (in |
|               DeReKo set to null, hence absent)</documentation> |
|             <data type="anyURI"/> |
|           </attribute> |
|         </optional> |
|         <optional> |
|           <attribute name="key"> |
|             <!-- NOTE(review): this description used to be a verbatim copy of the one for @ns. |
|                  The wording below is a best guess and has to be confirmed. --> |
|             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">key for the |
|               index entry (presumably the handle that the description of @name mentions; to be |
|               confirmed)</documentation> |
|             <data type="NCName"/> |
|           </attribute> |
|         </optional> |
|         <optional> |
|           <attribute name="extra"> |
|             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">extra |
|               features to be turned into extra keys in the index; e.g., the @name may be "hi" and |
|               the @extra can be "rend", which causes an extra key, e.g. "rend:bold" to be |
|               associated with this span, and its payload is set to 'element:hi' to make sure about |
|               its origin; this attribute lists attribute names...</documentation> |
|             <!-- Any string is accepted; the listed attribute names are not validated. --> |
|             <text/> |
|           </attribute> |
|         </optional> |
|         <optional> |
|           <ref name="fs"/> |
|         </optional> |
|       </element> |
|     </zeroOrMore> |
|   </element> |
| </define> |
| <!-- layer --> |
| |
| <define name="common_top_content"> |
|   <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">For @type="foundry" |
|     and @type="document". In the centralized foundry list, these values would be |
|     invalid.</documentation> |
|   <!-- Required document identifier (NCName). --> |
|   <attribute name="docid"> |
|     <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute is |
|       crucial for document-level metadata, and should be the same across the header, the text, |
|       and the metadata files.</documentation> |
|     <data type="NCName"/> |
|   </attribute> |
|   <!-- Optional boolean flag; its annotation lives inside the attribute it describes. --> |
|   <optional> |
|     <attribute name="masked"> |
|       <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute |
|         should only apply to document-level foundries, I think, unless we use it more generally, |
|         to mark withdrawn foundries (?)</documentation> |
|       <data type="boolean"/> |
|     </attribute> |
|   </optional> |
|   <!-- Exactly one doc element, followed by any number of binary elements. --> |
|   <element name="doc"> |
|     <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This element |
|       makes it possible to create the file path to the raw text file, and the xpath to the |
|       appropriate element.</documentation> |
|     <attribute name="file"> |
|       <data type="normalizedString"/> |
|     </attribute> |
|   </element> |
|   <zeroOrMore> |
|     <element name="binary"> |
|       <attribute name="id"> |
|         <data type="ID"/> |
|       </attribute> |
|       <attribute name="file"> |
|         <data type="anyURI"/> |
|       </attribute> |
|     </element> |
|   </zeroOrMore> |
| </define> |
| <!-- common_top_content --> |
| |
| <define name="info"> |
|   <!-- Provenance information: either free text or one structured tool description, |
|        optionally followed by a rem element. --> |
|   <element name="info"> |
|     <choice> |
|       <text/> |
|       <!-- Structured alternative: the tool that produced the annotation. Only @name is |
|            required; the remaining attributes and child elements are optional. --> |
|       <element name="tool"> |
|         <attribute name="name"> |
|           <data type="string"/> |
|         </attribute> |
|         <optional> |
|           <attribute name="uri"> |
|             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">sometimes |
|               a URI may be useful to identify the tool</documentation> |
|             <data type="anyURI"/> |
|           </attribute> |
|         </optional> |
|         <optional> |
|           <attribute name="ver"> |
|             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">version |
|               information</documentation> |
|             <data type="string"/> |
|           </attribute> |
|         </optional> |
|         <optional> |
|           <attribute name="date"> |
|             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">date it |
|               was used (can be provided in the changelog)</documentation> |
|             <data type="date"/> |
|           </attribute> |
|         </optional> |
|         <optional> |
|           <attribute name="model"> |
|             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">Model |
|               used to derive the output</documentation> |
|             <data type="string"/> |
|           </attribute> |
|         </optional> |
|         <!-- Optional changelog: one or more dated change entries with free-text content. --> |
|         <optional> |
|           <element name="changelog"> |
|             <oneOrMore> |
|               <element name="change"> |
|                 <attribute name="date"> |
|                   <data type="date"/> |
|                 </attribute> |
|                 <text/> |
|               </element> |
|             </oneOrMore> |
|           </element> |
|         </optional> |
|         <optional> |
|           <element name="rem"> |
|             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">for loose |
|               remarks</documentation> |
|             <text/> |
|           </element> |
|         </optional> |
|       </element> |
|     </choice> |
|     <!-- A rem is also allowed at the info level, after the text or the tool element. --> |
|     <optional> |
|       <element name="rem"> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">for loose |
|           remarks</documentation> |
|         <text/> |
|       </element> |
|     </optional> |
|   </element> |
| </define> |
| <!-- info --> |
| |
| <start> |
|   <!-- Root element. The value of @type selects one of three content models: |
|        "document", "foundry" or "central". --> |
|   <element name="metadata"> |
|     <choice> |
|       <!-- type="document": common top content, then foundry entries that carry only the |
|            naming attributes and a @path. --> |
|       <group> |
|         <attribute name="type"> |
|           <value type="NCName">document</value> |
|         </attribute> |
|         <ref name="common_top_content"/> |
|         <oneOrMore> |
|           <element name="foundry"> |
|             <ref name="foundry_atts"/> |
|             <attribute name="path"> |
|               <data type="normalizedString"/> |
|             </attribute> |
|           </element> |
|         </oneOrMore> |
|       </group> |
| |
|       <!-- type="foundry" and type="central" share the same fully described foundry entries; |
|            only the "foundry" variant additionally takes the common top content. --> |
|       <group> |
|         <choice> |
|           <group> |
|             <attribute name="type"> |
|               <value type="NCName">foundry</value> |
|             </attribute> |
|             <ref name="common_top_content"/> |
|           </group> |
|           <attribute name="type"> |
|             <value type="NCName">central</value> |
|           </attribute> |
|         </choice> |
|         <oneOrMore> |
|           <element name="foundry"> |
|             <ref name="non-document_top_content"/> |
|             <ref name="foundry_atts"/> |
|             <optional> |
|               <ref name="info"/> |
|             </optional> |
|             <oneOrMore> |
|               <ref name="layer"/> |
|             </oneOrMore> |
|           </element> |
|         </oneOrMore> |
|       </group> |
|       <!--<group> |
|         <value type="NCName">speech</value> |
|         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">for lack of a |
|           better word... the idea being that this is a piece of metadata that encodes some |
|           information concerning the binary stream that is decomposed "downstairs" into |
|           individual speaker transcription lines</documentation> |
|       </group>--> |
| |
|     </choice> |
|   </element> |
| </start> |
| <include href="fsr.rng"/> |
| </grammar> |