<?xml version="1.0" encoding="UTF-8"?>
<grammar xmlns="http://relaxng.org/ns/structure/1.0"
datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes" ns="http://ids-mannheim.de/ns/KorAP">
<!-- $Id$ -->
<documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">RNG schema for KorAP
XML metadata</documentation>
<define name="non-document_top_content">
  <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">Attribute set that is
    supposed to appear both in the extracted foundries (all_metadata.xml) and in their
    exported/consolidated versions.</documentation>

  <!-- Mandatory identifier of the foundry element. -->
  <attribute name="id">
    <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">For foundry
      elements, the value is built from the document ID, an underscore, and @nspref. The central
      foundry list has no docIDs, so there the value is fully redundant wrt @nspref (oh
      well).</documentation>
    <data type="ID"/>
  </attribute>

  <!-- The two optional attributes below are independent of each other. -->
  <optional>
    <attribute name="restricted">
      <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">Points at wherever
        the information on the licensing conditions is stored. For the time being, its actual
        occurrences will be given some fake URI values.</documentation>
      <data type="anyURI"/>
    </attribute>
  </optional>
  <optional>
    <attribute name="dependsOn">
      <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">Supposed to match
        the @nspref attribute of a foundry that the foundry in question depends on (e.g., the mate
        foundry depends on the base foundry for tokenization, so it uses @dependsOn="#base"). This
        has to be taken into account when exporting -- fragIDs have to be turned into long
        (potentially relative) URIs.</documentation>
      <data type="anyURI"/>
    </attribute>
  </optional>
</define>
<!-- non-document_top_content -->
<define name="foundry_atts">
  <!-- Naming attributes carried by every foundry element: the full name and its short prefix. -->
  <attribute name="name">
    <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This is the name of
      the foundry, it can be longish but still has to obey the constraints on XML Names. @nspref
      is its (possibly) shorter version, used for all sorts of referential magic.</documentation>
    <data type="NCName"/>
  </attribute>
  <attribute name="nspref">
    <!-- The description leads the attribute content, as for every other attribute in this
         grammar, so that annotation-aware tools attach it to @nspref. -->
    <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute
      defines the namespace prefix that serves to identify foundries (e.g. "tt4" for (some version
      of) TreeTagger). Note also that the value list should be open, because users will be able to
      add their own foundries and thus to define their own prefixes. It may also be expected to be
      shorter than foundry name. This attribute is used as part of the IDs for layers (so the 1st
      layer in the base foundry will be ID-ed as "base_l1", and in the opennlp foundry, the ID
      will be "onlp_l1"), and also as a reference anchor for the @dependsOn
      attribute (via the resolution of 'long' URIs or just fragIDs, file-internally).</documentation>
    <data type="NCName"/>
  </attribute>
</define>
<!-- foundry_atts -->
<define name="layer">
  <!-- One annotation layer of a foundry. Each layer names its source (@file or @external),
       carries an @id and a @name, and is characterised by either @segm or @info. -->
  <element name="layer">
    <!-- Source of the layer: exactly one of @file / @external. -->
    <choice>
      <attribute name="file">
        <data type="anyURI"/>
      </attribute>
      <attribute name="external">
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This is to be
          used in consolidated foundry metadata, to signal that the layer in question
          (usually/always the tokenization layer) is external to the foundry. This attribute sadly
          becomes invalid if the foundry is exported, and thus needs a special mechanism in such
          cases (possibly, it should entail the export of the targeted layer from another (base?)
          foundry and then it should be replaced by the appropriate @file
          attribute)</documentation>
        <data type="IDREF"/>
      </attribute>
    </choice>
    <attribute name="id">
      <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">For layer
        elements, this consists of the foundry ID, underscore, and "l" followed by a number. It is
        obligatory because you never know whether it may be referenced from
        outside.</documentation>
      <data type="ID"/>
    </attribute>
    <optional>
      <attribute name="contains">
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute
          is a list of layers (in this very foundry) that this layer makes redundant; it is useful
          for KorAP-internal indexing strategies. Whether we should be able to reference layers in
          other foundries by URI is a matter that we leave for later (possibly something like
          @containsURI will help us then, to make it easier to validate these simple
          relationships)</documentation>
        <data type="IDREFS"/>
      </attribute>
    </optional>
    <attribute name="name">
      <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This list should
        in fact be open. I.e., it is useful now to restrict it, to eliminate some mismatch bugs,
        but for production, these values should become suggestions, maybe except for
        "token". Note also that this value is used in constructing element IDs.</documentation>
      <!-- A value element cannot hold child elements, so each description follows its value. -->
      <choice>
        <value type="NCName">token</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">tokenization;
          the presence of the element with this name should be forced in each foundry-layer
          metadata, but RNG on its own doesn't provide a clean way of encoding
          that</documentation>
        <value type="NCName">sent</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">sentence
          segmentation</documentation>
        <value type="NCName">syntax</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">gross syntactic
          structure</documentation>
        <value type="NCName">syntax-const</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">syntax:
          constituent structure</documentation>
        <value type="NCName">syntax-dep</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">syntax:
          dependency relations</documentation>
        <value type="NCName">morph</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">morphosyntactic
          information</documentation>
        <value type="NCName">phrase</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">phrasal
          segmentation</documentation>
        <value type="NCName">para</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">paragraph
          segmentation</documentation>
        <value type="NCName">aggr</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'greedy' (more
          precisely: aggressive) tokenization</documentation>
        <value type="NCName">cons</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">conservative
          ('greedy' in the regex sense) tokenization</documentation>
        <value type="NCName">struct</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">structural
          divisions in the text, highlighting info, etc.</documentation>
        <value type="NCName">ne</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
          entities</documentation>
        <value type="NCName">ne_dewac</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
          entities, dewac model for the Stanford NER</documentation>
        <value type="NCName">ne_hgc</value>
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
          entities, hgc model for the Stanford NER</documentation>
      </choice>
    </attribute>
    <!-- A layer carries either @segm or @info, never both. -->
    <choice>
      <attribute name="segm">
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">granularity of
          segmentation; possibly, this should allow a list of values, to be fully
          flexible</documentation>
        <choice>
          <value type="NCName">para</value>
          <value type="NCName">s</value>
          <value type="NCName">chunk</value>
          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">The 'chunk'
            value is meant to be a catch-all, if a more precise value can't be
            determined</documentation>
          <value type="NCName">tok</value>
        </choice>
      </attribute>
      <attribute name="info">
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">kind of
          information expressed by the given layer of annotation (there may, and often will, be
          more than one)</documentation>
        <!-- Whitespace-separated list of one or more of the tokens below. -->
        <list>
          <oneOrMore>
            <choice>
              <value type="NCName">pos</value>
              <value type="NCName">lemma</value>
              <value type="NCName">msd</value>
              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'msd' is
                the traditional abbreviation for "morphosyntactic description", listing info on
                e.g. tense, person, case, etc.</documentation>
              <value type="NCName">dep</value>
              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'dep' is
                information about types of relations, used in dependency-style annotations; it is
                an indication for the visualiser that word-to-word relationships should be
                displayed</documentation>
              <value type="NCName">lbl</value>
              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'lbl'
                indicates the presence of labels over dependency relations</documentation>
              <value type="NCName">const</value>
              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'const'
                stands for 'constituency' or hierarchical, tree-based annotations; it is an
                indication for the visualiser that it should display syntactic
                trees</documentation>
              <value type="NCName">cat</value>
              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'cat' is
                used for syntactic categories, as separate from pos; note that these sets need not
                be disjoint (at the lexical level, they usually overlap), but the frontend prefers
                to keep them separate. 'cat' will be found in the context of chunking or
                hierarchical parsing and will characterise nodes; it may also be found in
                dependency annotations, to indicate labels on nodes, as opposed to labels on arcs
                (the latter are signalled by 'lbl')</documentation>
              <value type="NCName">struct</value>
              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">all
                non-linguistic information (headers, highlights, etc.)</documentation>
              <value type="NCName">frag</value>
              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0"
                >non-exhaustive coverage (when spanList/@fragmented="true")</documentation>
              <value type="NCName">ne</value>
              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
                entities</documentation>
            </choice>
          </oneOrMore>
        </list>
      </attribute>
    </choice>
    <optional>
      <ref name="info"/>
    </optional>
    <!-- Index directives: any number of idx children may follow the optional info. -->
    <zeroOrMore>
      <element name="idx">
        <attribute name="name">
          <!-- NOTE(review): the description mentions @handle, but idx declares no such
               attribute in this grammar; @key may be its current name. Confirm and align. -->
          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">name of the element to match; if this element lacks the attribute @handle, @name is used as the handle for the index</documentation>
          <data type="string"/>
        </attribute>
        <optional>
          <attribute name="ns">
            <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">namespace (in DeReKo set to null, hence absent)</documentation>
            <data type="anyURI"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="key">
            <!-- NOTE(review): this attribute previously carried a verbatim copy of the @ns
                 description, which cannot apply to an NCName-typed value. The intended
                 meaning is not visible from this grammar; confirm and describe it properly. -->
            <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">key for the index entry, given as an XML NCName (exact semantics to be confirmed; presumably the explicit handle that otherwise defaults to @name)</documentation>
            <data type="NCName"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="extra">
            <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">extra features to be turned into extra keys in the index; e.g., the @name may be "hi" and the @extra can be "rend", which causes an extra key, e.g. "rend:bold" to be associated with this span, and its payload is set to 'element:hi' to make sure about its origin; this attribute lists attribute names...</documentation>
            <!-- Free text: a repeated text pattern accepts exactly what a single one does. -->
            <text/>
          </attribute>
        </optional>
        <optional>
          <!-- "fs" is not defined in this file; it is expected to come from the included fsr.rng. -->
          <ref name="fs"/>
        </optional>
      </element>
    </zeroOrMore>
  </element>
</define>
<!-- layer -->
<define name="common_top_content">
  <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">For @type="foundry"
    and @type="document". In the centralized foundry list, these values would be
    invalid.</documentation>
  <attribute name="docid">
    <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute is
      crucial for document-level metadata, and should be the same across the header, the text,
      and the metadata files.</documentation>
    <data type="NCName"/>
  </attribute>
  <optional>
    <attribute name="masked">
      <!-- The description lives inside the attribute it describes, like all other
           attribute descriptions in this grammar. -->
      <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute
        should only apply to document-level foundries, I think, unless we use it more generally,
        to mark withdrawn foundries (?)</documentation>
      <data type="boolean"/>
    </attribute>
  </optional>
  <!-- Exactly one doc element, followed by any number of binary elements. -->
  <element name="doc">
    <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This element
      makes it possible to create the file path to the raw text file, and the xpath to the
      appropriate element.</documentation>
    <attribute name="file">
      <data type="normalizedString"/>
    </attribute>
  </element>
  <zeroOrMore>
    <element name="binary">
      <attribute name="id">
        <data type="ID"/>
      </attribute>
      <attribute name="file">
        <data type="anyURI"/>
      </attribute>
    </element>
  </zeroOrMore>
</define>
<!-- common_top_content -->
<define name="info">
  <!-- An info element holds either free text or a single structured tool description,
       optionally followed by a rem element. -->
  <element name="info">
    <choice>
      <text/>
      <element name="tool">
        <!-- Only @name is required; the remaining attributes are optional. -->
        <attribute name="name">
          <data type="string"/>
        </attribute>
        <optional>
          <attribute name="uri">
            <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">a URI that
              may sometimes be useful to identify the tool</documentation>
            <data type="anyURI"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="ver">
            <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">version
              information</documentation>
            <data type="string"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="date">
            <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">date the
              tool was used (can be provided in the changelog)</documentation>
            <data type="date"/>
          </attribute>
        </optional>
        <optional>
          <attribute name="model">
            <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">model used
              to derive the output</documentation>
            <data type="string"/>
          </attribute>
        </optional>
        <!-- Child elements, in this order: an optional changelog, then an optional rem. -->
        <optional>
          <element name="changelog">
            <oneOrMore>
              <element name="change">
                <attribute name="date">
                  <data type="date"/>
                </attribute>
                <text/>
              </element>
            </oneOrMore>
          </element>
        </optional>
        <optional>
          <element name="rem">
            <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">for loose
              remarks</documentation>
            <text/>
          </element>
        </optional>
      </element>
    </choice>
    <!-- A rem is also allowed directly under info, after the text or the tool element. -->
    <optional>
      <element name="rem">
        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">for loose
          remarks</documentation>
        <text/>
      </element>
    </optional>
  </element>
</define>
<!-- info -->
<start>
  <!-- Root element. The value of @type selects which of the content models below applies. -->
  <element name="metadata">
    <choice>
      <!-- type="document": common top-level content, then one or more brief foundry
           entries consisting of the naming attributes and a path. -->
      <group>
        <attribute name="type">
          <value type="NCName">document</value>
        </attribute>
        <ref name="common_top_content"/>
        <oneOrMore>
          <element name="foundry">
            <ref name="foundry_atts"/>
            <attribute name="path">
              <data type="normalizedString"/>
            </attribute>
          </element>
        </oneOrMore>
      </group>
      <!-- type="foundry" and type="central" share the same fully described foundry list.
           They differ only in the header: "foundry" also requires the common top-level
           content, "central" does not. -->
      <group>
        <choice>
          <group>
            <attribute name="type">
              <value type="NCName">foundry</value>
            </attribute>
            <ref name="common_top_content"/>
          </group>
          <attribute name="type">
            <value type="NCName">central</value>
          </attribute>
        </choice>
        <oneOrMore>
          <element name="foundry">
            <ref name="non-document_top_content"/>
            <ref name="foundry_atts"/>
            <optional>
              <ref name="info"/>
            </optional>
            <oneOrMore>
              <ref name="layer"/>
            </oneOrMore>
          </element>
        </oneOrMore>
      </group>
      <!-- Disabled draft of a further metadata flavour, kept as found: -->
      <!--<group>
<value type="NCName">speech</value>
<documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">for lack of a
better word... the idea being that this is a piece of metadata that encodes some
information concerning the binary stream that is decomposed "downstairs" into
individual speaker transcription lines</documentation>
</group>-->
    </choice>
  </element>
</start>
<include href="fsr.rng"/>
</grammar>