Add first working conversion pipeline
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7ec798d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,55 @@
+# Input EPUB directory, scratch directory for unpacked EPUBs, and output directory.
+SRC_DIR ?= test/resources/DNB
+BUILD_DIR = build
+TARGET_DIR ?= target
+# Fail a recipe when any stage of a pipeline fails, and delete a target file whose recipe failed.
+SHELL := bash
+.SHELLFLAGS := -eu -o pipefail -c
+.DELETE_ON_ERROR:
+.PHONY: all clean test
+all: $(TARGET_DIR)/dnb.i5.xml
+# Corpus file = template without its last line + every converted book + the template's last line.
+$(TARGET_DIR)/dnb.i5.xml: xslt/idsCorpus-template.xml $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*.epub))
+	mkdir -p $(@D) && head -n -1 $< > $@
+	cat $(filter %.i5.xml,$^) < /dev/null >> $@
+	tail -n 1 $< >> $@
+# Validate the assembled corpus against the DTD named in its DOCTYPE (xmllint --valid).
+test: $(TARGET_DIR)/dnb.i5.xml xslt/epub2i5.xsl
+	xmllint --noout --valid $<
+# Unpack one EPUB; on a hard unzip error (status > 1) the directory is removed so it cannot pass as up to date.
+$(BUILD_DIR)/%: $(SRC_DIR)/%.epub
+	mkdir -p $@
+	echo "Converting $< to $@"
+	unzip -q -o $< -d $@ || { rc=$$?; [ $$rc -le 1 ] || rm -rf $@; exit $$rc; }
+# Convert an unpacked EPUB to I5; the OPF package file is looked up one directory level below the unpack root.
+$(TARGET_DIR)/%.i5.xml: $(BUILD_DIR)/% xslt/epub2i5.xsl
+	mkdir -p $(@D)
+	echo "Converting $< to $@"
+	java -jar lib/saxon9ee.jar -xsl:xslt/epub2i5.xsl $</*/content.opf > $@
+# I5 corpus file -> KorAP-XML ZIP (tei2korapxml reads stdin, writes the ZIP to stdout).
+%.zip: %.i5.xml
+	tei2korapxml -l warn -s -tk - < $< > $@
+# Annotation layer from the korap/conllu2treetagger Docker image (language: german).
+%.tree_tagger.zip: %.zip
+	korapxml2conllu $< | pv | docker run --rm -i korap/conllu2treetagger -l german | conllu2korapxml > $@
+# Annotation layer from the korap/conllu2spacy Docker image.
+%.spacy.zip: %.zip
+	korapxml2conllu $< | pv | docker run --rm -i korap/conllu2spacy | conllu2korapxml > $@
+# Annotation layer from a UDPipe server, reached through scripts/udpipe2.
+%.ud.zip: %.zip
+	korapxml2conllu $< | pv | ./scripts/udpipe2 | conllu2korapxml > $@
+# Annotation layer from conllu2cmc.
+%.cmc.zip: %.zip
+	korapxml2conllu $< | pv | conllu2cmc -s | conllu2korapxml > $@
+# Krill archive from base ZIP + UDPipe + CMC layers. NOTE(review): -o gets the target name without .tar; confirm the tool appends it.
+%.krill.tar: %.zip %.ud.zip %.cmc.zip
+	korapxml2krill archive --quiet -w -z -cfg krill-kokokom.cfg --non-word-tokens --meta I5 -i $< -i $(word 2,$^) -i $(word 3,$^) -o $(basename $@)
+# Unpack every Krill archive found in the current directory into a fresh json/ directory; stop at the first tar error.
+json: *.krill.tar
+	rm -rf json
+	mkdir -p json
+	for f in $^; do tar -C json -xf "$$f" || exit 1; done
+# Remove the scratch and output directories.
+clean:
+	rm -rf $(BUILD_DIR) $(TARGET_DIR)
+
diff --git a/Readme.md b/Readme.md
index 677f60a..20fdd7d 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,7 +1,30 @@
-# EPub to TEI I5 conversion
+# EPub to KorAP (via TEI I5) conversion
+## Run
+
+### To generate I5 corpus
+
+```bash
+make target/dnb.i5.xml
+```
+
+### To generate the KorAP-XML ZIP
+
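+Prerequisites: [KorAP-XML-TEI](https://github.com/KorAP/KorAP-XML-TEI) (provides `tei2korapxml`); the annotation targets below additionally need [KorAP-XML-CoNLL-U](https://github.com/KorAP/KorAP-XML-CoNLL-U)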
+
+```bash
+make target/dnb.zip
+```
+
+### To generate Annotations
+
+```bash
+make target/dnb.spacy.zip target/dnb.tree_tagger.zip
+```
## News
+* 2024-03-16: first working pipeline for EPub ⮕ TEI I5 ⮕ KorAP-XML ⮕ (UDPipe+TreeTagger+Spacy) ⮕ Krill ⮕ KorAP-JSON
+
* 2024-03-15: DNB test data added
* 2024-03-08: example EPub and I5 added from DeReKo KJL corpus: *Christiane F. ; Kai Hermann ; Horst Rieck: Wir Kinder vom Bahnhof Zoo* in the folder [`test/resources/`](./test/resources/) – do not distribute (copyrighted data)
diff --git a/lib/saxon-license.lic b/lib/saxon-license.lic
new file mode 100644
index 0000000..c104d43
--- /dev/null
+++ b/lib/saxon-license.lic
@@ -0,0 +1,18 @@
+Licensor=Saxonica
+Licensee=Marc Kupietz
+Company=Institut für Deutsche Sprache
+Email=kupietz@ids-mannheim.de
+Edition=EE
+SAT=yes
+SAQ=yes
+SAV=yes
+Issued=2019-01-10
+Series=V
+Serial=V007508
+User=P0001
+Evaluation=no
+Expiration=never
+UpgradeDays=366
+MaintenanceDays=366
+
+Signature=302C02142962C7B427BE5221DF20508FB60CE956B3C762E7021415F7021BBF9EC2EF562E40C7651DE2120891BE73
\ No newline at end of file
diff --git a/lib/saxon9ee.jar b/lib/saxon9ee.jar
new file mode 100644
index 0000000..6035609
--- /dev/null
+++ b/lib/saxon9ee.jar
Binary files differ
diff --git a/scripts/section_helper.sh b/scripts/section_helper.sh
new file mode 100644
index 0000000..ddefe0d
--- /dev/null
+++ b/scripts/section_helper.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Reference: https://docs.gitlab.com/ee/ci/jobs/#custom-collapsible-sections
+
+# start_section ID TITLE
+#   Print the section_start marker for section ID (collapsed=true) followed by
+#   TITLE in bold cyan. Sets the global variables id and title.
+start_section() {
+  id="${1}" title="${2}"
+  # Marker format: see the GitLab reference at the top of this file.
+  echo -e "\e[0Ksection_start:$(date +%s):${id}[collapsed=true]\r\e[0K\e[36;1m${title}\e[0m"
+}
+
+# end_section ID
+#   Print the section_end marker for the section with the given ID.
+#   Sets the global variable id.
+end_section() {
+  id="${1}"
+  echo -e "\e[0Ksection_end:$(date +%s):${id}\r\e[0K"
+}
diff --git a/scripts/udpipe2 b/scripts/udpipe2
new file mode 100755
index 0000000..ac7eec4
--- /dev/null
+++ b/scripts/udpipe2
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Send CoNLL-U read from stdin to a UDPipe REST server in batches and print the returned results to stdout.
+usage() { echo "Usage: $0 [-r | -s <server>] [-m <model>] [-l] < c.conllu > c.ud.conllu" 1>&2; exit 1; }
+
+LOCAL_SERVER=http://compute.ids-mannheim.de:8001
+LINDAT_SERVER=https://lindat.mff.cuni.cz/services/udpipe/api
+
+server=${LOCAL_SERVER}
+model=de_hdt
+# Succeeds iff GET <server>/models is answered with HTTP status 200.
+udpipe_server_is_operational () {
+  [ "$(curl -s -o /dev/null -w "%{http_code}" "${1}/models")" = 200 ]
+}
+# Start from the local server, but fall back to LINDAT when it does not answer.
+if ! udpipe_server_is_operational "$server"; then
+  echo "WARNING: Local server $server is not responding, defaulting to LINDAT server." >&2
+  server=$LINDAT_SERVER
+fi
+# Options: -r use LINDAT, -s <server>, -m <model>, -l list the server's models and exit; anything else prints usage.
+while getopts "s:m:rhl" o; do
+  case "${o}" in
+    r)
+      server=${LINDAT_SERVER}
+      ;;
+    s)
+      server=${OPTARG}
+      ;;
+    m)
+      model=${OPTARG}
+      ;;
+    l)
+      curl "${server}/models"
+      exit 0
+      ;;
+    *)
+      usage
+      ;;
+  esac
+done
+shift $((OPTIND-1))
+
+if ! udpipe_server_is_operational "$server"; then
+  echo "ERROR: Udpipe server $server is not operational." >&2
+  exit 255
+fi
+# Each pass forwards one batch: 120000 lines, extended up to the next empty line. Stop once a pass reads nothing.
+idx=1
+while [[ $idx -gt 0 ]]; do
+  idx=0
+  while IFS= read -r line && { [[ $idx -lt 120000 ]] || [[ -n "$line" ]]; }; do
+    idx=$(( idx + 1 ))
+    echo "$line"
+  done > >(curl --silent -F data=@- -F model="${model}" -F tagger= -F parser= "${server}/process" | jq -j .result )
+  wait $! 2> /dev/null  # let this batch finish printing before the next one starts (needs bash >= 4.4)
+done
diff --git a/xslt/epub2i5.xsl b/xslt/epub2i5.xsl
new file mode 100644
index 0000000..c5f5c95
--- /dev/null
+++ b/xslt/epub2i5.xsl
@@ -0,0 +1,405 @@
+<xsl:stylesheet version="3.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:xs="http://www.w3.org/2001/XMLSchema"
+ xmlns:opf="http://www.idpf.org/2007/opf"
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:ids="http://www.ids-mannheim.de/ids"
+ xmlns:hlu="http://www.ids-mannheim.de/hlu"
+ xmlns:saxon="http://saxon.sf.net/"
+ xmlns:xhtml="http://www.w3.org/1999/xhtml"
+ exclude-result-prefixes="xs opf dc ids hlu saxon xhtml">
+
+ <xsl:output method="xml" indent="yes" omit-xml-declaration="yes" saxon:line-length="1000"/>
+ <xsl:strip-space elements="*"/>
+
+ <xsl:variable name="ev"/>
+ <xsl:variable name="x"/>
+
+ <xsl:variable name="isbn" as="xs:string" select="replace(document-uri(), '.*([0-9]{13,}).*' , '$1')"/>
+
+
+ <xsl:variable name="dnbBookdata"><!-- oai_dc record that the DNB SRU service returns for a NUM search on $isbn; needs network access -->
+   <xsl:copy-of select="doc(concat('https://services.dnb.de/sru/dnb?version=1.1&amp;operation=searchRetrieve&amp;query=NUM%3D', $isbn, '&amp;recordSchema=oai_dc'))"/>
+ </xsl:variable>
+
+ <!-- All dc:creator values joined with ' ; ', each with its bracketed suffix removed; replace() is applied per creator because it accepts only a single string. -->
+ <xsl:variable name="autor" select="string-join($dnbBookdata//dc:creator/replace(., ' *\[[^\]]*\]', ''), ' ; ')"/>
+
+ <xsl:variable name="straight_autor" select="replace(hlu:reversedAuthors($autor), ',', '')"/>
+
+ <xsl:variable name="ina"/>
+ <xsl:variable name="_corpus"/>
+ <xsl:variable name="ent_known"/>
+
+
+ <!-- added HLU 2012-02-09: -->
+ <xsl:variable name="ent">
+ <xsl:choose>
+ <xsl:when test="$ent_known">
+ <xsl:value-of select="$ent_known"/>
+ </xsl:when>
+ <xsl:when test="$ev">
+ <xsl:value-of select="$ev"/>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$j"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <xsl:variable name="titel">
+ <xsl:choose>
+ <xsl:when test="contains($dnbBookdata//dc:title,':')">
+ <xsl:value-of select="normalize-space(substring-before(substring-before($dnbBookdata//dc:title, '/'), ':'))"
+ />
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="normalize-space(substring-before($dnbBookdata//dc:title, '/'))"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <xsl:variable name="erscheinungsort">
+ <xsl:choose>
+ <xsl:when test="contains($dnbBookdata//dc:publisher,':')">
+ <xsl:value-of select="normalize-space(substring-before($dnbBookdata//dc:publisher, ':'))"/>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="normalize-space($dnbBookdata//dc:publisher)"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <xsl:variable name="verlag">
+ <xsl:choose>
+ <xsl:when test="contains($dnbBookdata//dc:publisher,':')">
+ <xsl:value-of select="normalize-space(substring-after($dnbBookdata//dc:publisher, ':'))"/>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="normalize-space($dnbBookdata//dc:publisher)"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <xsl:variable name="erscheinungsjahr">
+ <xsl:choose>
+ <xsl:when test="matches($dnbBookdata//dc:date, '^[0-9]{4}$')">
+ <xsl:value-of select="$dnbBookdata//dc:date"/>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="substring-before($dnbBookdata//dc:date, '-')"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <xsl:variable name="untertitel"
+ select="normalize-space(substring-after(substring-before($dnbBookdata//dc:title, '/'), ':'))"/>
+
+ <xsl:variable name="herausgeber">
+ <xsl:choose>
+ <xsl:when test="$dnbBookdata//dc:creator[ends-with(.,'[Hrsg.]')]">
+ <xsl:value-of
+ select="replace(string-join($dnbBookdata//dc:creator[ends-with(.,'[Hrsg.]')], ' ; '),'\s?\[Hrsg.\]','')"
+ />
+ </xsl:when>
+ <xsl:otherwise>.</xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <xsl:variable name="straight_herausgeber"
+ select="replace(hlu:reversedAuthors($herausgeber), ',', '')"/>
+
+ <xsl:variable name="j" select="$dnbBookdata//dc:date"/>
+
+ <!-- for BOT+s: -->
+ <xsl:variable name="seiten" select="replace($dnbBookdata//dc:format,'S\.','')"/>
+
+ <!-- fuer BOT+b: -->
+ <xsl:variable name="_b">
+ <xsl:variable name="regexp1" select="'(Band|Bd\.)\s*([0-9]?[0-9]?[0-9])'"/>
+ <xsl:choose>
+ <xsl:when test="matches($dnbBookdata, $regexp1)">
+ <xsl:analyze-string select="$dnbBookdata//dc:title" regex="{$regexp1}">
+ <xsl:matching-substring>
+ <xsl:value-of select="."/>
+ </xsl:matching-substring>
+ </xsl:analyze-string>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="'.'"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <!-- for BOT+x: text type. $x in brackets if set, else the genre word found in the subtitle (with an optional preceding word) in brackets, else 'Roman'. -->
+ <xsl:variable name="txtart">
+   <xsl:choose>
+     <xsl:when test="$x">
+       <xsl:value-of select="concat('[', $x, ']')"/>
+     </xsl:when>
+     <xsl:when
+       test="matches($untertitel, '([Rr]oman|[Ee]rzählung(en)?|[Aa]nthologie|[Gg]eschichte(n)?|[Nn]ovelle)')">
+       <xsl:value-of
+         select="concat('[', replace(replace($untertitel, '.*?(((^|\P{L})\p{L}+)?([Rr]oman|[Ee]rzählung(en)?|[Aa]nthologie|[Gg]eschichte(n)?|[Nn]ovelle)).*', '$1'), '\P{L}*(.+)', '$1'), ']')"
+       />
+     </xsl:when>
+     <xsl:otherwise>
+       <xsl:value-of>Roman</xsl:value-of>
+     </xsl:otherwise>
+   </xsl:choose>
+ </xsl:variable>
+
+
+
+ <!-- fuer BOTd: -->
+ <xsl:variable name="dok"
+ select="concat((if(string-length($autor) > 0) then concat($straight_autor, ': ') else ''), $titel, ', ', $txtart, ', (', $j, ')')"/>
+
+ <!-- END variables derived from sru request to dnb archive -->
+
+
+ <xsl:variable name="corpus_sigle" select="'DNB'"/>
+
+ <!-- for BOTD: document sigle = two author initials followed by the initial of the first content word of the title. -->
+ <!-- The document sigle must be unique in combination with the corpus sigle (e.g. DIV for loz-div and loz-div-pub). -->
+ <xsl:variable name="doc_sigle">
+   <xsl:variable name="firstContentWordTitleInitial">
+     <xsl:variable name="helper">
+       <xsl:analyze-string select="$titel" regex="\w+">
+         <xsl:matching-substring>
+           <xsl:choose>
+             <xsl:when
+               test="matches(.,'^[A-Z]') and not(matches(.,'^(Der|Die|Das|Des|Ein|Eine|Eines|Einmal|Von|Mit|Zu|Zurück)$'))">
+               <!-- TODO: add further function words -->
+               <xsl:sequence select="."/>
+             </xsl:when>
+             <xsl:otherwise/>
+           </xsl:choose>
+         </xsl:matching-substring>
+       </xsl:analyze-string>
+     </xsl:variable>
+     <xsl:value-of
+       select="upper-case(substring(normalize-space(replace($helper,'\s+.+$','')),1,1))"/>
+     <!-- longest match of .+ : everything after the first kept word is cut off -->
+   </xsl:variable>
+   <xsl:choose>
+     <xsl:when test="contains($autor,';')">
+       <xsl:variable name="lastname_aut1"
+         select="upper-case(substring(normalize-space(substring-before(substring-before($autor,';'),',')), 1, 1))"/>
+       <xsl:variable name="lastname_aut2"
+         select="upper-case(substring(normalize-space(substring-before(tokenize($autor, ';')[2], ',')), 1, 1))"/>
+       <xsl:value-of select="concat($lastname_aut1, $lastname_aut2)"/>
+     </xsl:when>
+     <xsl:otherwise>
+       <xsl:variable name="lastname_aut1"
+         select="upper-case(substring(normalize-space(substring-before($autor,',')),1,1))"/>
+       <xsl:variable name="firstname_aut1"
+         select="upper-case(substring(normalize-space(substring-after($autor,',')),1,1))"/>
+       <xsl:value-of select="concat($lastname_aut1, $firstname_aut1)"/>
+     </xsl:otherwise>
+   </xsl:choose>
+   <xsl:value-of select="$firstContentWordTitleInitial"/>
+ </xsl:variable>
+
+
+ <xsl:variable name="text_sigle" select="substring($isbn, 8, 5)"/>
+ <xsl:variable name="sigle" select="concat($corpus_sigle, '/', $doc_sigle, '.', $text_sigle)"/>
+
+ <!-- fuer BOT+xy: (?) -->
+ <xsl:variable name="xyref">
+ <xsl:value-of select="document-uri(.)"/>
+ <xsl:text>; </xsl:text>
+ <xsl:text>ISBN:</xsl:text>
+ <xsl:value-of select="$isbn"/>
+ <xsl:text>; </xsl:text>
+ <xsl:value-of select="string-join($dnbBookdata//dc:identifier)"/>
+ </xsl:variable>
+
+ <!-- Root template: writes one idsDoc (document header, text header, body) for the OPF package file given as source document. -->
+ <xsl:template match="/">
+   <idsDoc TEIform="TEI.2" type="text" version="1.0">
+     <idsHeader TEIform="teiHeader" pattern="text" status="new" type="document" version="1.1">
+       <fileDesc>
+         <titleStmt>
+           <dokumentSigle><xsl:value-of select="concat($corpus_sigle, '/', $doc_sigle)"/></dokumentSigle>
+           <d.title><xsl:value-of select="$dok"/></d.title>
+         </titleStmt>
+         <publicationStmt>
+           <distributor/>
+           <pubAddress/>
+           <availability region="world" status="unknown">QAO-NC</availability>
+           <pubDate/>
+         </publicationStmt>
+         <sourceDesc>
+           <biblStruct>
+             <monogr>
+               <h.title type="main"/>
+               <imprint/>
+             </monogr>
+           </biblStruct>
+         </sourceDesc>
+       </fileDesc>
+     </idsHeader>
+     <idsText version="1.0">
+       <idsHeader TEIform="teiHeader" pattern="text" status="new" type="text" version="1.1">
+         <fileDesc>
+           <titleStmt>
+             <textSigle><xsl:sequence select="$sigle"/></textSigle>
+             <t.title assemblage="regular"><xsl:value-of select="concat($sigle, ' ', $autor, ': ', $titel, ', ', $txtart, '. - ', $erscheinungsort, ', ', $erscheinungsjahr)"/></t.title>
+           </titleStmt>
+           <publicationStmt>
+             <distributor/>
+             <pubAddress/>
+             <availability region="world" status="unknown">QAO-NC</availability>
+             <pubDate/>
+           </publicationStmt>
+           <sourceDesc>
+             <biblStruct>
+               <monogr>
+                 <h.title type="main"><xsl:value-of select="$titel"/></h.title>
+                 <h.title type="sub"><xsl:value-of select="$untertitel"/></h.title>
+                 <h.author><xsl:value-of select="$autor"/></h.author>
+                 <editor/>
+                 <edition>
+                   <further/>
+                   <kind>E-Book-Ausgabe</kind>
+                   <appearance>EPUB-Datei</appearance>
+                 </edition>
+                 <imprint>
+                   <publisher><xsl:value-of select="$verlag"/></publisher>
+                   <pubDate type="year"><xsl:value-of select="$j"/></pubDate>
+                   <pubDate type="month"/>
+                   <pubDate type="day"/>
+                   <pubPlace key="DE"><xsl:value-of select="$erscheinungsort"/></pubPlace>
+                 </imprint>
+                 <biblScope type="subsume"/>
+                 <biblScope type="pp"/>
+                 <biblScope type="vol"/>
+                 <biblScope type="volume-title"/>
+               </monogr>
+             </biblStruct>
+             <reference assemblage="regular" type="complete"><xsl:value-of select="concat($sigle, ' ', $autor, ': ', $titel, '. ', $erscheinungsort, ': ', $verlag, ', ', $erscheinungsjahr)"/></reference>
+             <reference assemblage="regular" type="short"><xsl:value-of select="concat($sigle, ' ', string-join(for $a in tokenize($autor, ' ; ') return normalize-space(substring-before(concat($a, ','), ',')), ' ; '), ': ', $titel, ', ', $erscheinungsjahr)"/></reference>
+           </sourceDesc>
+         </fileDesc>
+         <profileDesc>
+           <creation>
+             <creatDate><xsl:value-of select="$j"/></creatDate>
+           </creation>
+           <textClass/>
+           <textDesc>
+             <textType>Jugendliteratur</textType><!-- NOTE(review): still the fixed value of the example book; confirm the intended text type for DNB -->
+             <textTypeRef>Jugendliteratur</textTypeRef>
+             <textDomain/>
+           </textDesc>
+         </profileDesc>
+       </idsHeader>
+       <text>
+         <body>
+           <!-- One chapter per manifest item whose href ends in .html or .xhtml,
+                skipping items whose href contains cover, toc, copyright or feedback. -->
+           <xsl:apply-templates select="//opf:package/opf:manifest/opf:item[matches(@href, '\.x?html$') and not(matches(@href, '(cover|toc|copyright|feedback).*'))]" mode="collect"/>
+         </body>
+       </text>
+     </idsText>
+   </idsDoc>
+ </xsl:template>
+ <!-- Convert one manifest item: log it, load the document its href (resolved against the item's base URI) points to, and process that document's body. -->
+ <xsl:template match="opf:item" mode="collect">
+   <xsl:message>
+     <xsl:text>converting: </xsl:text><xsl:value-of select="@href"/><xsl:text> </xsl:text><xsl:value-of select="$isbn"/>
+   </xsl:message>
+   <xsl:variable name="chapter" select="doc(resolve-uri(@href, base-uri()))"/>
+   <xsl:apply-templates select="$chapter/xhtml:html/xhtml:body"/>
+ </xsl:template>
+ <!-- Each XHTML body becomes one chapter division. -->
+ <xsl:template match="xhtml:body">
+ <div type="chapter">
+ <xsl:apply-templates/>
+ </div>
+ </xsl:template>
+ <!-- title becomes head. NOTE(review): the collect template only hands over xhtml:body, so this may never fire; confirm. -->
+ <xsl:template match="xhtml:title">
+ <head>
+ <xsl:apply-templates/>
+ </head>
+ </xsl:template>
+ <!-- h1 becomes a main heading. -->
+ <xsl:template match="xhtml:h1">
+ <head>
+ <xsl:apply-templates/>
+ </head>
+ </xsl:template>
+ <!-- h2 and h3 become sub headings. -->
+ <xsl:template match="xhtml:h2|xhtml:h3">
+ <head type="sub">
+ <xsl:apply-templates/>
+ </head>
+ </xsl:template>
+
+ <!-- span with class italic becomes hi rend="italic". -->
+ <xsl:template match="xhtml:span[@class='italic']">
+ <hi rend="italic">
+ <xsl:apply-templates/>
+ </hi>
+ </xsl:template>
+ <!-- span with class bold becomes hi rend="bold". -->
+ <xsl:template match="xhtml:span[@class='bold']">
+ <hi rend="bold">
+ <xsl:apply-templates/>
+ </hi>
+ </xsl:template>
+ <!-- span with class sub becomes hi rend="sub". -->
+ <xsl:template match="xhtml:span[@class='sub']">
+ <hi rend="sub">
+ <xsl:apply-templates/>
+ </hi>
+ </xsl:template>
+ <!-- span with class sup becomes hi rend="sup". -->
+ <xsl:template match="xhtml:span[@class='sup']">
+ <hi rend="sup">
+ <xsl:apply-templates/>
+ </hi>
+ </xsl:template>
+ <!-- Every XHTML div becomes a section division. -->
+ <xsl:template match="xhtml:div">
+ <div type="section">
+ <xsl:apply-templates/>
+ </div>
+ </xsl:template>
+ <!-- Paragraphs are kept as paragraphs. -->
+ <xsl:template match="xhtml:p">
+ <p>
+ <xsl:apply-templates/>
+ </p>
+ </xsl:template>
+ <!-- Images produce no output; the gap element below is disabled. -->
+ <xsl:template match="xhtml:img">
+ <!-- <gap reason="image"/> -->
+ </xsl:template>
+ <!-- Fallback for any other XHTML element: report its name and non-empty attributes, then keep processing its children without a wrapper element. -->
+ <xsl:template match="xhtml:*">
+ <xsl:message>
+ <xsl:text>unhandled element: </xsl:text><xsl:value-of select="concat(name(), ' ', string-join(./@*[normalize-space(.) != '']/concat(name(), ':', ., ' '), '_'))"/>
+ </xsl:message>
+ <xsl:apply-templates/>
+ </xsl:template>
+ <!-- ids:reversedAuthors($s): for a single name, rewrites 'A B' to 'B, A' (split at the last space). -->
+ <!-- When $s contains ';', the parts before and after the first ' ; ' are converted separately -->
+ <!-- and joined with ' ; ' again. -->
+ <xsl:function name="ids:reversedAuthors">
+   <xsl:param name="s"/>
+   <xsl:value-of select="
+       if (matches($s, ';'))
+       then concat(ids:reversedAuthors(substring-before($s, ' ; ')), ' ; ', ids:reversedAuthors(substring-after($s, ' ; ')))
+       else replace($s, '(.+) (.+)', '$2, $1')"/>
+ </xsl:function>
+
+ <!-- hlu:reversedAuthors($s): for a single name, rewrites 'A,B' to 'B, A' (split at the last comma); lists containing ';' are delegated to ids:reversedAuthors per part. -->
+ <xsl:function name="hlu:reversedAuthors">
+   <xsl:param name="s"/>
+   <xsl:value-of select="
+       if (matches($s, ';')) then concat(ids:reversedAuthors(substring-before($s, ' ; ')), ' ; ', ids:reversedAuthors(substring-after($s, ' ; ')))
+       else replace($s, '(.+),(.+)', '$2, $1')"/>
+ </xsl:function>
+
+</xsl:stylesheet>
diff --git a/xslt/idsCorpus-template.xml b/xslt/idsCorpus-template.xml
new file mode 100644
index 0000000..7ff9f0f
--- /dev/null
+++ b/xslt/idsCorpus-template.xml
@@ -0,0 +1,242 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd">
+<idsCorpus xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" TEIform="teiCorpus.2" version="1.0">
+ <idsHeader TEIform="teiHeader" pattern="allesaußerZtg/Zschr" status="new" type="corpus" version="1.1">
+ <fileDesc>
+ <titleStmt>
+ <korpusSigle>DNB</korpusSigle>
+ <c.title>Deutschsprachige Belletristik</c.title>
+ </titleStmt>
+ <publicationStmt>
+ <distributor> Institut für Deutsche Sprache </distributor>
+ <pubAddress> Postfach 10 16 21, D-68016 Mannheim </pubAddress>
+ <telephone> +49 (0)621 1581 0 </telephone>
+ <availability region="world" status="unknown">QAO-NC</availability>
+ <pubDate/>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct>
+ <monogr>
+ <h.title type="main"/>
+ <imprint/>
+ </monogr>
+ </biblStruct>
+ </sourceDesc>
+ </fileDesc>
+ <encodingDesc>
+ <projectDesc/>
+ <samplingDecl/>
+ <editorialDecl>
+ <transduction>korap4dnb</transduction>
+ <pagination type="no"/>
+ </editorialDecl>
+ <classDecl>
+ <taxonomy id="topic">
+ <h.bibl>Thementaxonomie (siehe http://www.ids-mannheim.de/kl/projekte/methoden/te.html)</h.bibl>
+ <category id="topic.fiktion">
+ <catDesc>Fiktion</catDesc>
+ <category id="topic.fiktion.vermischtes">
+ <catDesc>Fiktion:Vermischtes</catDesc>
+ </category>
+ </category>
+ <category id="topic.freizeit-unterhaltung">
+ <catDesc>Freizeit_Unterhaltung</catDesc>
+ <category id="topic.freizeit-unterhaltung.reisen">
+ <catDesc>Freizeit_Unterhaltung:Reisen</catDesc>
+ </category>
+ <category id="topic.freizeit-unterhaltung.rundfunk">
+ <catDesc>Freizeit_Unterhaltung:Rundfunk</catDesc>
+ </category>
+ <category id="topic.freizeit-unterhaltung.vereine-veranstaltungen">
+ <catDesc>Freizeit_Unterhaltung:Vereine_Veranstaltungen</catDesc>
+ </category>
+ </category>
+ <category id="topic.gesundheit-ernaehrung">
+ <catDesc>Gesundheit_Ernaehrung</catDesc>
+ <category id="topic.gesundheit-ernaehrung.ernaehrung">
+ <catDesc>Gesundheit_Ernaehrung:Ernaehrung</catDesc>
+ </category>
+ <category id="topic.gesundheit-ernaehrung.gesundheit">
+ <catDesc>Gesundheit_Ernaehrung:Gesundheit</catDesc>
+ </category>
+ </category>
+ <category id="topic.kultur">
+ <catDesc>Kultur</catDesc>
+ <category id="topic.kultur.bildende-kunst">
+ <catDesc>Kultur:Bildende Kunst</catDesc>
+ </category>
+ <category id="topic.kultur.darstellende-kunst">
+ <catDesc>Kultur:Darstellende Kunst</catDesc>
+ </category>
+ <category id="topic.kultur.film">
+ <catDesc>Kultur:Film</catDesc>
+ </category>
+ <category id="topic.kultur.literatur">
+ <catDesc>Kultur:Literatur</catDesc>
+ </category>
+ <category id="topic.kultur.mode">
+ <catDesc>Kultur:Mode</catDesc>
+ </category>
+ <category id="topic.kultur.musik">
+ <catDesc>Kultur:Musik</catDesc>
+ </category>
+ </category>
+ <category id="topic.natur-umwelt">
+ <catDesc>Natur_Umwelt</catDesc>
+ <category id="topic.natur-umwelt.garten">
+ <catDesc>Natur_Umwelt:Garten</catDesc>
+ </category>
+ <category id="topic.natur-umwelt.tiere">
+ <catDesc>Natur_Umwelt:Tiere</catDesc>
+ </category>
+ <category id="topic.natur-umwelt.wetter-klima">
+ <catDesc>Natur_Umwelt:Wetter_Klima</catDesc>
+ </category>
+ </category>
+ <category id="topic.politik">
+ <catDesc>Politik</catDesc>
+ <category id="topic.politik.ausland">
+ <catDesc>Politik:Ausland</catDesc>
+ </category>
+ <category id="topic.politik.inland">
+ <catDesc>Politik:Inland</catDesc>
+ </category>
+ <category id="topic.politik.kommunalpolitik">
+ <catDesc>Politik:Kommunalpolitik</catDesc>
+ </category>
+ </category>
+ <category id="topic.rest">
+ <catDesc>Rest</catDesc>
+ <category id="topic.rest.boersenkurse">
+ <catDesc>Rest:boersenkurse</catDesc>
+ </category>
+ <category id="topic.rest.geburt-tod-heirat">
+ <catDesc>Rest:geburt_tod_heirat</catDesc>
+ </category>
+ <category id="topic.rest.impressum">
+ <catDesc>Rest:impressum</catDesc>
+ </category>
+ <category id="topic.rest.inhaltsverzeichnisse">
+ <catDesc>Rest:inhaltsverzeichnisse</catDesc>
+ </category>
+ <category id="topic.rest.ligatabellen">
+ <catDesc>Rest:ligatabellen</catDesc>
+ </category>
+ <category id="topic.rest.tabellen">
+ <catDesc>Rest:tabellen</catDesc>
+ </category>
+ <category id="topic.rest.veranstaltungshinweise">
+ <catDesc>Rest:veranstaltungshinweise</catDesc>
+ </category>
+ </category>
+ <category id="topic.sport">
+ <catDesc>Sport</catDesc>
+ <category id="topic.sport.ballsport">
+ <catDesc>Sport:Ballsport</catDesc>
+ </category>
+ <category id="topic.sport.fussball">
+ <catDesc>Sport:Fussball</catDesc>
+ </category>
+ <category id="topic.sport.motorsport">
+ <catDesc>Sport:Motorsport</catDesc>
+ </category>
+ <category id="topic.sport.radsport">
+ <catDesc>Sport:Radsport</catDesc>
+ </category>
+ <category id="topic.sport.tennis">
+ <catDesc>Sport:Tennis</catDesc>
+ </category>
+ <category id="topic.sport.vermischtes">
+ <catDesc>Sport:Vermischtes</catDesc>
+ </category>
+ <category id="topic.sport.wintersport">
+ <catDesc>Sport:Wintersport</catDesc>
+ </category>
+ </category>
+ <category id="topic.staat-gesellschaft">
+ <catDesc>Staat_Gesellschaft</catDesc>
+ <category id="topic.staat-gesellschaft.arbeit-und-beruf">
+ <catDesc>Staat_Gesellschaft:Arbeit_und_Beruf</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.bildung">
+ <catDesc>Staat_Gesellschaft:Bildung</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.biographien-interviews">
+ <catDesc>Staat_Gesellschaft:Biographien_Interviews</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.drittes-reich-rechtsextremismus">
+ <catDesc>Staat_Gesellschaft:Drittes_Reich_Rechtsextremismus</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.familie-geschlecht">
+ <catDesc>Staat_Gesellschaft:Familie_Geschlecht</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.kirche">
+ <catDesc>Staat_Gesellschaft:Kirche</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.recht">
+ <catDesc>Staat_Gesellschaft:Recht</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.tod">
+ <catDesc>Staat_Gesellschaft:Tod</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.verbrechen">
+ <catDesc>Staat_Gesellschaft:Verbrechen</catDesc>
+ </category>
+ </category>
+ <category id="topic.technik-industrie">
+ <catDesc>Technik_Industrie</catDesc>
+ <category id="topic.technik-industrie.edv-elektronik">
+ <catDesc>Technik_Industrie:EDV_Elektronik</catDesc>
+ </category>
+ <category id="topic.technik-industrie.kfz">
+ <catDesc>Technik_Industrie:Kfz</catDesc>
+ </category>
+ <category id="topic.technik-industrie.transport-verkehr">
+ <catDesc>Technik_Industrie:Transport_Verkehr</catDesc>
+ </category>
+ <category id="topic.technik-industrie.umweltschutz">
+ <catDesc>Technik_Industrie:Umweltschutz</catDesc>
+ </category>
+ <category id="topic.technik-industrie.unfaelle">
+ <catDesc>Technik_Industrie:Unfaelle</catDesc>
+ </category>
+ </category>
+ <category id="topic.wirtschaft-finanzen">
+ <catDesc>Wirtschaft_Finanzen</catDesc>
+ <category id="topic.wirtschaft-finanzen.banken">
+ <catDesc>Wirtschaft_Finanzen:Banken</catDesc>
+ </category>
+ <category id="topic.wirtschaft-finanzen.bilanzen">
+ <catDesc>Wirtschaft_Finanzen:Bilanzen</catDesc>
+ </category>
+ <category id="topic.wirtschaft-finanzen.oeffentliche-finanzen">
+ <catDesc>Wirtschaft_Finanzen:Oeffentliche_Finanzen</catDesc>
+ </category>
+ <category id="topic.wirtschaft-finanzen.sozialprodukt">
+ <catDesc>Wirtschaft_Finanzen:Sozialprodukt</catDesc>
+ </category>
+ <category id="topic.wirtschaft-finanzen.waehrung">
+ <catDesc>Wirtschaft_Finanzen:Waehrung</catDesc>
+ </category>
+ </category>
+ <category id="topic.wissenschaft">
+ <catDesc>Wissenschaft</catDesc>
+ <category id="topic.wissenschaft.populaerwissenschaft">
+ <catDesc>Wissenschaft:Populaerwissenschaft</catDesc>
+ </category>
+ </category>
+ <category id="topic.unklassifizierbar">
+ <catDesc>Text ist thematisch nicht klassifizierbar.</catDesc>
+ </category>
+ </taxonomy>
+ </classDecl>
+ </encodingDesc>
+ <profileDesc>
+ <langUsage>
+ <language id="de" usage="100">Deutsch</language>
+ </langUsage>
+ <textDesc/>
+ </profileDesc>
+ </idsHeader>
+
+</idsCorpus>