add schemas and catalog
Change-Id: I271e13d4b6db999f6d56c2f923a1d7c493fee172
diff --git a/KorAP_schemas/metadata.rng b/KorAP_schemas/metadata.rng
new file mode 100644
index 0000000..09b4fdd
--- /dev/null
+++ b/KorAP_schemas/metadata.rng
@@ -0,0 +1,423 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<grammar xmlns="http://relaxng.org/ns/structure/1.0"
+ datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes" ns="http://ids-mannheim.de/ns/KorAP">
+ <!-- $Id$ -->
+ <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">RNG schema for KorAP
+ XML metadata</documentation>
+
+ <define name="non-document_top_content">
+   <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">Supposed to appear in
+     both the extracted foundries (all_metadata.xml) and their exported/consolidated
+     versions</documentation>
+   <attribute name="id">
+     <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">For foundry
+       elements, this consists of the document ID, underscore, and @nspref. In the central foundry
+       list, there are no docIDs, so it is fully redundant wrt @nspref (oh well).</documentation>
+     <data type="ID"/>
+   </attribute>
+   <optional>
+     <attribute name="dependsOn">
+       <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute is
+         supposed to match the @nspref attribute of a foundry that the foundry in question depends on
+         (e.g., the mate foundry depends on the base foundry for tokenization, so uses @dependsOn="#base"). This has to be
+         taken into account when exporting -- fragIDs have to be turned into long (potentially relative) URIs.</documentation>
+       <data type="anyURI"/>
+     </attribute>
+   </optional>
+   <optional>
+     <attribute name="restricted">
+       <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute
+         points at wherever the information on the licensing conditions is stored. So, for the time
+         being, its actual occurrences will be given some fake URI values.</documentation>
+       <data type="anyURI"/>
+     </attribute>
+   </optional>
+ </define>
+ <!-- non-document_top_content -->
+
+ <define name="foundry_atts"><!-- naming attributes (@name, @nspref) shared by every foundry element -->
+   <attribute name="name">
+     <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This is the name of
+       the foundry, it can be longish but still has to obey the constraints on XML Names. @nspref
+       is its (possibly) shorter version, used for all sorts of referential magic.</documentation>
+     <data type="NCName"/>
+   </attribute>
+   <attribute name="nspref">
+     <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute
+       defines the namespace prefix that serves to identify foundries (e.g. "tt4" for (some version
+       of) TreeTagger). Note also that the value list should be open, because users will be able to
+       add their own foundries and thus to define their own prefixes. It may also be expected to be
+       shorter than foundry name. This attribute is used as part of the IDs for layers (so the 1st
+       layer in the base foundry will be ID-ed as "base_l1", and in the opennlp foundry, the ID
+       will be "onlp_l1"), and also as a reference anchor for the @dependsOn
+       attribute (via the resolution of 'long' URIs or just fragIDs, file-internally).</documentation>
+     <data type="NCName"/>
+   </attribute>
+ </define>
+ <!-- foundry_atts -->
+
+ <define name="layer"><!-- one annotation layer of a foundry, with its optional index (idx) hints -->
+   <element name="layer">
+     <choice>
+       <attribute name="file">
+         <data type="anyURI"/>
+       </attribute>
+       <attribute name="external">
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This is to be
+           used in consolidated foundry metadata, to signal that the layer in question
+           (usually/always the tokenization layer) is external to the foundry. This attribute sadly
+           becomes invalid if the foundry is exported, and thus needs a special mechanism in such
+           cases (possibly, it should entail the export of the targeted layer from another (base?)
+           foundry and then it should be replaced by the appropriate @file
+           attribute)</documentation>
+         <data type="IDREF"/>
+       </attribute>
+     </choice>
+     <attribute name="id">
+       <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">For layer
+         elements, this consists of the foundry ID, underscore, and "l" followed by a number. It is
+         obligatory because you never know whether it may be referenced from
+         outside.</documentation>
+       <data type="ID"/>
+     </attribute>
+     <optional>
+       <attribute name="contains">
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute
+           is a list of layers (in this very foundry) that this layer makes redundant; it is useful
+           for KorAP-internal indexing strategies. Whether we should be able to reference layers in
+           other foundries by URI is a matter that we leave for later (possibly something like
+           @containsURI will help us then, to make it easier to validate these simple
+           relationships)</documentation>
+         <data type="IDREFS"/>
+       </attribute>
+     </optional>
+     <attribute name="name">
+       <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This list should
+         in fact be open. I.e., it is useful now to restrict it, to eliminate some mismatch bugs,
+         but for production, these values should become suggestions, maybe except for
+         "token". Note also that this value is used in constructing element IDs.</documentation>
+       <choice>
+         <value type="NCName">token</value>
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">tokenization;
+           the presence of the element with this name should be forced in each foundry-layer
+           metadata, but RNG on its own doesn't provide a clean way of encoding
+           that</documentation>
+         <value type="NCName">sent</value>
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">sentence
+           segmentation</documentation>
+         <value type="NCName">syntax</value>
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">gross syntactic
+           structure</documentation>
+         <value type="NCName">syntax-const</value>
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">syntax:
+           constituent structure</documentation>
+         <value type="NCName">syntax-dep</value>
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">syntax:
+           dependency relations</documentation>
+         <value type="NCName">morph</value>
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">morphosyntactic
+           information</documentation>
+         <value type="NCName">phrase</value>
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">phrasal
+           segmentation</documentation>
+         <value type="NCName">para</value>
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">paragraph
+           segmentation</documentation>
+         <value type="NCName">aggr</value>
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'greedy' (more
+           precisely: aggressive) tokenization</documentation>
+         <value type="NCName">cons</value>
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">conservative
+           ('greedy' in the regex sense) tokenization</documentation>
+         <value type="NCName">struct</value>
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">structural
+           divisions in the text, highlighting info, etc.</documentation>
+         <value type="NCName">ne</value>
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
+           entities</documentation>
+         <value type="NCName">ne_dewac</value>
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
+           entities, dewac model for the Stanford NER</documentation>
+         <value type="NCName">ne_hgc</value>
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
+           entities, hgc model for the Stanford NER</documentation>
+       </choice>
+     </attribute>
+     <choice>
+       <attribute name="segm">
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">granularity of
+           segmentation; possibly, this should allow a list of values, to be fully
+           flexible</documentation>
+         <choice>
+           <value type="NCName">para</value>
+           <value type="NCName">s</value>
+           <value type="NCName">chunk</value>
+           <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">The 'chunk'
+             value is meant to be a catch-all, if a more precise value can't be
+             determined</documentation>
+           <value type="NCName">tok</value>
+         </choice>
+       </attribute>
+       <attribute name="info">
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">kind of
+           information expressed by the given layer of annotation (there may, and often will, be
+           more than one)</documentation>
+         <list>
+           <oneOrMore>
+             <choice>
+               <value type="NCName">pos</value>
+               <value type="NCName">lemma</value>
+               <value type="NCName">msd</value>
+               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'msd' is
+                 the traditional abbreviation for "morphosyntactic description", listing info on
+                 e.g. tense, person, case, etc.</documentation>
+               <value type="NCName">dep</value>
+               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'dep' is
+                 information about types of relations, used in dependency-style annotations; it is
+                 an indication for the visualiser that word-to-word relationships should be
+                 displayed</documentation>
+               <value type="NCName">lbl</value>
+               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'lbl'
+                 indicates the presence of labels over dependency relations</documentation>
+               <value type="NCName">const</value>
+               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'const'
+                 stands for 'constituency' or hierarchical, tree-based annotations; it is an
+                 indication for the visualiser that it should display syntactic
+                 trees</documentation>
+               <value type="NCName">cat</value>
+               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'cat' is
+                 used for syntactic categories, as separate from pos; note that these sets need not
+                 be disjoint (at the lexical level, they usually overlap), but the frontend prefers
+                 to keep them separate. 'cat' will be found in the context of chunking or
+                 hierarchical parsing and will characterise nodes; it may also be found in
+                 dependency annotations, to indicate labels on nodes, as opposed to labels on arcs
+                 (the latter are signalled by 'lbl')</documentation>
+               <value type="NCName">struct</value>
+               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">all
+                 non-linguistic information (headers, highlights, etc.)</documentation>
+               <value type="NCName">frag</value>
+               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0"
+                 >non-exhaustive coverage (when spanList/@fragmented="true")</documentation>
+               <value type="NCName">ne</value>
+               <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
+                 entities</documentation>
+             </choice>
+           </oneOrMore>
+         </list>
+       </attribute>
+     </choice>
+     <optional>
+       <ref name="info"/>
+     </optional>
+     <zeroOrMore>
+       <element name="idx">
+         <attribute name="name">
+           <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">name of the element to match; if this element lacks the attribute @handle, @name is used as the handle for the index</documentation>
+           <data type="string"/>
+         </attribute>
+         <optional>
+           <attribute name="ns">
+             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">namespace (in DeReKo set to null, hence absent)</documentation>
+             <data type="anyURI"/>
+           </attribute>
+         </optional>
+         <optional>
+           <attribute name="key">
+             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">key for the index entry. NOTE(review): this description used to be a verbatim copy of the one for @ns; the intended semantics of @key are not stated anywhere in this schema (presumably it is the "handle" mentioned under @name) - TODO confirm</documentation>
+             <data type="NCName"/>
+           </attribute>
+         </optional>
+         <optional>
+           <attribute name="extra">
+             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">extra features to be turned into extra keys in the index; e.g., the @name may be "hi" and the @extra can be "rend", which causes an extra key, e.g. "rend:bold" to be associated with this span, and its payload is set to 'element:hi' to make sure about its origin; this attribute lists attribute names...</documentation>
+             <oneOrMore>
+               <text/>
+             </oneOrMore>
+           </attribute>
+         </optional>
+         <optional>
+           <ref name="fs"/>
+         </optional>
+       </element>
+     </zeroOrMore>
+   </element>
+ </define>
+ <!-- layer -->
+
+<define name="common_top_content">
+  <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">For @type="foundry"
+    and @type="document". In the centralized foundry list, these values would be
+    invalid.</documentation>
+  <attribute name="docid">
+    <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute is
+      crucial for document-level metadata, and should be the same across the header, the text,
+      and the metadata files.</documentation>
+    <data type="NCName"/>
+  </attribute>
+  <optional>
+    <attribute name="masked">
+      <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute
+        should only apply to document-level foundries, I think, unless we use it more generally,
+        to mark withdrawn foundries (?)</documentation>
+      <data type="boolean"/>
+    </attribute>
+  </optional>
+  <element name="doc">
+    <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This element
+      makes it possible to create the file path to the raw text file, and the xpath to the
+      appropriate element.</documentation>
+    <attribute name="file">
+      <data type="normalizedString"/>
+    </attribute>
+  </element>
+  <zeroOrMore>
+    <element name="binary">
+      <attribute name="id">
+        <data type="ID"/>
+      </attribute>
+      <attribute name="file">
+        <data type="anyURI"/>
+      </attribute>
+    </element>
+  </zeroOrMore>
+</define>
+ <!-- common_top_content -->
+
+ <define name="info">
+   <!-- info: either plain text or one structured tool description, optionally followed by a
+        loose remark (rem). -->
+   <element name="info">
+     <choice>
+       <text/>
+       <element name="tool">
+         <attribute name="name">
+           <data type="string"/>
+         </attribute>
+         <optional>
+           <attribute name="uri">
+             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">sometimes
+               a URI may be useful to identify the tool</documentation>
+             <data type="anyURI"/>
+           </attribute>
+         </optional>
+         <optional>
+           <attribute name="ver">
+             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">version
+               information</documentation>
+             <data type="string"/>
+           </attribute>
+         </optional>
+         <optional>
+           <attribute name="date">
+             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">date it
+               was used (can be provided in the changelog)</documentation>
+             <data type="date"/>
+           </attribute>
+         </optional>
+         <optional>
+           <attribute name="model">
+             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">Model
+               used to derive the output</documentation>
+             <data type="string"/>
+           </attribute>
+         </optional>
+         <optional>
+           <element name="changelog">
+             <oneOrMore>
+               <element name="change">
+                 <attribute name="date">
+                   <data type="date"/>
+                 </attribute>
+                 <text/>
+               </element>
+             </oneOrMore>
+           </element>
+         </optional>
+         <optional>
+           <element name="rem">
+             <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">for loose
+               remarks</documentation>
+             <text/>
+           </element>
+         </optional>
+       </element>
+     </choice>
+     <optional>
+       <element name="rem">
+         <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">for loose
+           remarks</documentation>
+         <text/>
+       </element>
+     </optional>
+   </element>
+ </define>
+ <!-- info -->
+
+ <start>
+   <!-- Root element. The value of @type selects the content model:
+        "document": common top content, then foundries given only by naming attributes and @path;
+        "foundry":  common top content, then fully described foundries (optional info, layers);
+        "central":  fully described foundries only, without the common top content. -->
+   <element name="metadata">
+
+     <choice>
+       <!-- type="document" -->
+       <group>
+         <attribute name="type">
+           <value type="NCName">document</value>
+         </attribute>
+         <ref name="common_top_content"/>
+         <oneOrMore>
+           <element name="foundry">
+             <ref name="foundry_atts"/>
+             <attribute name="path">
+               <data type="normalizedString"/>
+             </attribute>
+           </element>
+         </oneOrMore>
+       </group>
+
+       <!-- type="foundry" and type="central" differ only in what precedes the foundry list,
+            so the foundry description itself is written once and shared by both. -->
+       <group>
+         <choice>
+           <!-- type="foundry": document-level content comes first -->
+           <group>
+             <attribute name="type">
+               <value type="NCName">foundry</value>
+             </attribute>
+             <ref name="common_top_content"/>
+           </group>
+           <!-- type="central": nothing but the type marker -->
+           <attribute name="type">
+             <value type="NCName">central</value>
+           </attribute>
+         </choice>
+         <!-- foundry descriptions common to both variants -->
+         <oneOrMore>
+           <element name="foundry">
+             <ref name="non-document_top_content"/>
+             <ref name="foundry_atts"/>
+             <optional>
+               <ref name="info"/>
+             </optional>
+             <oneOrMore>
+               <ref name="layer"/>
+             </oneOrMore>
+           </element>
+         </oneOrMore>
+       </group>
+
+       <!--<group>
+ <value type="NCName">speech</value>
+ <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">for lack of a
+ better word... the idea being that this is a piece of metadata that encodes some
+ information concerning the binary stream that is decomposed "downstairs" into
+ individual speaker transcription lines</documentation>
+ </group>-->
+
+     </choice>
+   </element>
+ </start>
+ <include href="fsr.rng"/>
+</grammar>