add schemas and catalog

Change-Id: I271e13d4b6db999f6d56c2f923a1d7c493fee172
diff --git a/KorAP_schemas/metadata.rng b/KorAP_schemas/metadata.rng
new file mode 100644
index 0000000..09b4fdd
--- /dev/null
+++ b/KorAP_schemas/metadata.rng
@@ -0,0 +1,423 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<grammar xmlns="http://relaxng.org/ns/structure/1.0"
+  datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes" ns="http://ids-mannheim.de/ns/KorAP">
+  <!-- $Id$ -->
+  <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">RNG schema for KorAP
+    XML metadata</documentation>
+  
+  <define name="non-document_top_content">
+    <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">Supposed to appear in
+      both the extracted foundries (all_metadata.xml) and their exported/consolidated
+      versions</documentation>
+    <attribute name="id">
+      <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">For foundry
+        elements, this consists of the document ID, underscore, and @nspref. In the central foundry
+        list, there are no docIDs, so it is fully redundant wrt @nspref (oh well).</documentation>
+      <data type="ID"/>
+    </attribute>
+    <optional>
+      <attribute name="dependsOn">
+        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute is
+          supposed to match the @nspref attribute of a foundry that the foundry in question depends on
+          (e.g., the mate foundry depends on the base foundry for tokenization, so it uses @dependsOn="#base"). This has to be
+          taken into account when exporting -- fragIDs have to be turned into long (potentially relative) URIs.</documentation>
+        <data type="anyURI"/>
+      </attribute>
+    </optional>
+    <optional>
+      <attribute name="restricted">
+        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute
+          points at wherever the information on the licensing conditions is stored. So, for the time
+          being, its actual occurrences will be given some fake URI values.</documentation>
+        <data type="anyURI"/>
+      </attribute>
+    </optional>
+  </define>
+  <!-- non-document_top_content -->
+  
+  <define name="foundry_atts">
+    <attribute name="name">
+      <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This is the name of
+        the foundry, it can be longish but still has to obey the constraints on XML Names. @nspref
+        is its (possibly) shorter version, used for all sorts of referential magic.</documentation>
+      <data type="NCName"/>
+    </attribute>
+    <attribute name="nspref">
+      <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute
+        defines the namespace prefix that serves to identify foundries (e.g. "tt4" for (some version
+        of) TreeTagger). Note also that the value list should be open, because users will be able to
+        add their own foundries and thus to define their own prefixes. It may also be expected to be
+        shorter than foundry name. This attribute is used as part of the IDs for layers (so the 1st
+        layer in the base foundry will be ID-ed as "base_l1", and in the opennlp foundry, the ID
+        will be "onlp_l1"), and also as a reference anchor for the @dependsOn
+        attribute (via the resolution of 'long' URIs or just fragIDs, file-internally).</documentation>
+      <data type="NCName"/>
+    </attribute>
+  </define>
+  <!-- foundry_atts -->
+
+  <define name="layer">
+    <element name="layer">
+      <choice>
+        <attribute name="file">
+          <data type="anyURI"/>
+        </attribute>
+        <attribute name="external">
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This is to be
+            used in consolidated foundry metadata, to signal that the layer in question
+            (usually/always the tokenization layer) is external to the foundry. This attribute sadly
+            becomes invalid if the foundry is exported, and thus needs a special mechanism in such
+            cases (possibly, it should entail the export of the targeted layer from another (base?)
+            foundry and then it should be replaced by the appropriate @file
+            attribute)</documentation>
+          <data type="IDREF"/>
+        </attribute>
+      </choice>
+      <attribute name="id">
+        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">For layer
+          elements, this consists of the foundry ID, underscore, and "l" followed by a number. It is
+          obligatory because you never know whether it may be referenced from
+          outside.</documentation>
+        <data type="ID"/>
+      </attribute>
+      <optional>
+        <attribute name="contains">
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute
+            is a list of layers (in this very foundry) that this layer makes redundant; it is useful
+            for KorAP-internal indexing strategies. Whether we should be able to reference layers in
+            other foundries by URI is a matter that we leave for later (possibly something like
+            @containsURI will help us then, to make it easier to validate these simple
+            relationships)</documentation>
+          <data type="IDREFS"/>
+        </attribute>
+      </optional>
+      <attribute name="name">
+        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This list should
+          in fact be open. I.e., it is useful now to restrict it, to eliminate some mismatch bugs,
+          but for production, these values should become suggestions, maybe except for
+          "token". Note also that this value is used in constructing element IDs.</documentation>
+        <choice>
+          <value type="NCName">token</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">tokenization;
+            the presence of the element with this name should be forced in each foundry-layer
+            metadata, but RNG on its own doesn't provide a clean way of encoding
+            that</documentation>
+          <value type="NCName">sent</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">sentence
+            segmentation</documentation>
+          <value type="NCName">syntax</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">gross syntactic
+            structure</documentation>
+          <value type="NCName">syntax-const</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">syntax:
+            constituent structure</documentation>
+          <value type="NCName">syntax-dep</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">syntax:
+            dependency relations</documentation>
+          <value type="NCName">morph</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">morphosyntactic
+            information</documentation>
+          <value type="NCName">phrase</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">phrasal
+            segmentation</documentation>
+          <value type="NCName">para</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">paragraph
+            segmentation</documentation>
+          <value type="NCName">aggr</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'greedy' (more
+            precisely: aggressive) tokenization</documentation>
+          <value type="NCName">cons</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">conservative
+            ('greedy' in the regex sense) tokenization</documentation>
+          <value type="NCName">struct</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">structural
+            divisions in the text, highlighting info, etc.</documentation>
+          <value type="NCName">ne</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
+            entities</documentation>
+          <value type="NCName">ne_dewac</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
+            entities, dewac model for the Stanford NER</documentation>
+          <value type="NCName">ne_hgc</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
+            entities, hgc model for the Stanford NER</documentation>
+        </choice>
+      </attribute>
+      <choice>
+        <attribute name="segm">
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">granularity of
+            segmentation; possibly, this should allow a list of values, to be fully
+            flexible</documentation>
+          <choice>
+            <value type="NCName">para</value>
+            <value type="NCName">s</value>
+            <value type="NCName">chunk</value>
+            <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">The 'chunk'
+              value is meant to be a catch-all, if a more precise value can't be
+              determined</documentation>
+            <value type="NCName">tok</value>
+          </choice>
+        </attribute>
+        <attribute name="info">
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">kind of
+            information expressed by the given layer of annotation (there may, and often will, be
+            more than one)</documentation>
+          <list>
+            <oneOrMore>
+              <choice>
+                <value type="NCName">pos</value>
+                <value type="NCName">lemma</value>
+                <value type="NCName">msd</value>
+                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'msd' is
+                  the traditional abbreviation for "morphosyntactic description", listing info on
+                  e.g. tense, person, case, etc.</documentation>
+                <value type="NCName">dep</value>
+                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'dep' is
+                  information about types of relations, used in dependency-style annotations; it is
+                  an indication for the visualiser that word-to-word relationships should be
+                  displayed</documentation>
+                <value type="NCName">lbl</value>
+                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'lbl'
+                  indicates the presence of labels over dependency relations</documentation>
+                <value type="NCName">const</value>
+                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'const'
+                  stands for 'constituency' or hierarchical, tree-based annotations; it is an
+                  indication for the visualiser that it should display syntactic
+                  trees</documentation>
+                <value type="NCName">cat</value>
+                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'cat' is
+                  used for syntactic categories, as separate from pos; note that these sets need not
+                  be disjoint (at the lexical level, they usually overlap), but the frontend prefers
+                  to keep them separate. 'cat' will be found in the context of chunking or
+                  hierarchical parsing and will characterise nodes; it may also be found in
+                  dependency annotations, to indicate labels on nodes, as opposed to labels on arcs
+                  (the latter are signalled by 'lbl')</documentation>
+                <value type="NCName">struct</value>
+                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">all
+                  non-linguistic information (headers, highlights, etc.)</documentation>
+                <value type="NCName">frag</value>
+                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0"
+                  >non-exhaustive coverage (when spanList/@fragmented="true")</documentation>
+                <value type="NCName">ne</value>
+                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
+                  entities</documentation>
+              </choice>
+            </oneOrMore>
+          </list>
+        </attribute>
+      </choice>
+      <optional>
+        <ref name="info"/>
+      </optional>
+      <zeroOrMore>
+        <element name="idx">
+          <attribute name="name">
+            <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">name of the element to match; if this element lacks the attribute @handle, @name is used as the handle for the index</documentation>
+            <data type="string"/>
+          </attribute>
+          <optional>
+            <attribute name="ns">
+              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">namespace (in DeReKo set to null, hence absent)</documentation>
+              <data type="anyURI"/>
+            </attribute>
+          </optional>
+          <optional>
+            <attribute name="key">
+              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">key to use for the index entry (presumably the "handle" mentioned in the description of @name; to be confirmed)</documentation>
+              <data type="NCName"/>
+            </attribute>
+          </optional>
+          <optional>
+            <attribute name="extra">
+              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">extra features to be turned into extra keys in the index; e.g., the @name may be "hi" and the @extra can be "rend", which causes an extra key, e.g. "rend:bold" to be associated with this span, and its payload is set to 'element:hi' to make sure about its origin; this attribute lists attribute names...</documentation>
+              <oneOrMore>
+                <text/>
+              </oneOrMore>
+            </attribute>
+          </optional>
+          <optional>
+            <ref name="fs"/>
+          </optional>
+        </element>
+      </zeroOrMore>
+    </element>
+  </define>
+  <!-- layer -->
+
+  <define name="common_top_content">
+    <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">For @type="foundry"
+      and @type="document". In the centralized foundry list, these values would be
+      invalid.</documentation>
+    <attribute name="docid">
+      <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute is
+        crucial for document-level metadata, and should be the same across the header, the text,
+        and the metadata files.</documentation>
+      <data type="NCName"/>
+    </attribute>
+    <optional>
+      <attribute name="masked">
+        <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This attribute
+          should only apply to document-level foundries, I think, unless we use it more generally,
+          to mark withdrawn foundries (?)</documentation>
+        <data type="boolean"/>
+      </attribute>
+    </optional>
+    <element name="doc">
+      <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">This element
+        makes it possible to create the file path to the raw text file, and the xpath to the
+        appropriate element.</documentation>
+      <attribute name="file">
+        <data type="normalizedString"/>
+      </attribute>
+    </element>
+    <zeroOrMore>
+      <element name="binary">
+        <attribute name="id">
+          <data type="ID"/>
+        </attribute>
+        <attribute name="file">
+          <data type="anyURI"/>
+        </attribute>
+      </element>
+    </zeroOrMore>
+  </define>
+  <!-- common_top_content -->
+
+  <define name="info">
+    <!-- <info> holds either free text or a single structured <tool> description; in both
+         cases an optional <rem> remark may follow. -->
+    <element name="info">
+      <choice>
+        <text/>
+        <element name="tool">
+          <attribute name="name">
+            <data type="string"/>
+          </attribute>
+          <optional>
+            <attribute name="uri">
+              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">sometimes
+                a URI may be useful to identify the tool</documentation>
+              <data type="anyURI"/>
+            </attribute>
+          </optional>
+          <optional>
+            <attribute name="ver">
+              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">version
+                information</documentation>
+              <data type="string"/>
+            </attribute>
+          </optional>
+          <optional>
+            <attribute name="date">
+              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">date it
+                was used (can be provided in the changelog)</documentation>
+              <data type="date"/>
+            </attribute>
+          </optional>
+          <optional>
+            <attribute name="model">
+              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">Model
+                used to derive the output</documentation>
+              <data type="string"/>
+            </attribute>
+          </optional>
+          <optional>
+            <element name="changelog">
+              <oneOrMore>
+                <element name="change">
+                  <attribute name="date">
+                    <data type="date"/>
+                  </attribute>
+                  <text/>
+                </element>
+              </oneOrMore>
+            </element>
+          </optional>
+          <optional>
+            <element name="rem">
+              <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">for loose
+                remarks</documentation>
+              <text/>
+            </element>
+          </optional>
+        </element>
+      </choice>
+      <optional>
+        <element name="rem">
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">for loose
+            remarks</documentation>
+          <text/>
+        </element>
+      </optional>
+    </element>
+  </define>
+  <!-- info -->
+
+  <start>
+    <!-- Root element. The value of @type selects one of three content models:
+           "document": common top content, then foundry entries with name, nspref and path;
+           "foundry":  common top content, then full foundry descriptions;
+           "central":  full foundry descriptions only. -->
+    <element name="metadata">
+      <choice>
+
+        <!-- @type="document": each foundry entry carries only the attributes from
+             foundry_atts plus @path -->
+        <group>
+          <attribute name="type">
+            <value type="NCName">document</value>
+          </attribute>
+          <ref name="common_top_content"/>
+          <oneOrMore>
+            <element name="foundry">
+              <ref name="foundry_atts"/>
+              <attribute name="path">
+                <data type="normalizedString"/>
+              </attribute>
+            </element>
+          </oneOrMore>
+        </group>
+
+        <!-- @type="foundry" and @type="central" share one foundry description, so it is
+             stated once, after the part in which the two variants differ -->
+        <group>
+          <choice>
+            <!-- "foundry" additionally requires the common top content ... -->
+            <group>
+              <attribute name="type">
+                <value type="NCName">foundry</value>
+              </attribute>
+              <ref name="common_top_content"/>
+            </group>
+            <!-- ... whereas "central" consists of the foundry descriptions alone -->
+            <attribute name="type">
+              <value type="NCName">central</value>
+            </attribute>
+          </choice>
+          <oneOrMore>
+            <element name="foundry">
+              <ref name="non-document_top_content"/>
+              <ref name="foundry_atts"/>
+              <optional>
+                <ref name="info"/>
+              </optional>
+              <oneOrMore>
+                <ref name="layer"/>
+              </oneOrMore>
+            </element>
+          </oneOrMore>
+        </group>
+
+        <!--<group>
+          <value type="NCName">speech</value>
+          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">for lack of a
+            better word... the idea being that this is a piece of metadata that encodes some
+            information concerning the binary stream that is decomposed "downstairs" into
+            individual speaker transcription lines</documentation>
+        </group>-->
+
+      </choice>
+    </element>
+  </start>
+  <include href="fsr.rng"/>
+</grammar>