Add first working conversion pipeline

commit: 1a42266168a011af3e8c592221c4c9f8118700d0 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Sat Mar 16 09:34:10 2024 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Sat Mar 16 09:34:10 2024 +0100
tree: 920b02f23fcc68246d301a036ce41aa0fb4fe7ad
parent: 7747c11f1152645b938c487f38e16b6b91b12d4f [diff]
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7ec798d
--- /dev/null
+++ b/Makefile

@@ -0,0 +1,55 @@
+# Directory holding the source *.epub files (override: make SRC_DIR=...).
+SRC_DIR ?= test/resources/DNB
+# Scratch directory into which each EPUB is unpacked.
+BUILD_DIR = build
+# Directory receiving the generated I5 / KorAP-XML artefacts.
+TARGET_DIR ?= target
+
+# Most recipes below create their target through a shell redirection
+# (`... > $@`). Without this special target a failing command leaves an empty
+# or truncated file behind that make would treat as up to date on the next run.
+.DELETE_ON_ERROR:
+
+# Command names, not files.
+.PHONY: all clean test
+
+
+all: $(TARGET_DIR)/dnb.i5.xml
+
+# Assemble the corpus file: the template without its last line, then every
+# per-book I5 document, then the template's last line again.
+# The template is listed first so that $< names it and editing it triggers a
+# rebuild; $(filter-out ...) keeps it out of the concatenated documents.
+# /dev/null keeps cat from blocking on stdin when no EPUB was found, and
+# sed '$d' is the portable spelling of GNU `head -n -1`.
+$(TARGET_DIR)/dnb.i5.xml: xslt/idsCorpus-template.xml $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*.epub))
+	mkdir -p $(@D)
+	sed '$$d' $< > $@
+	cat /dev/null $(filter-out $<,$^) >> $@
+	tail -n 1 $< >> $@
+
+# Validate the assembled corpus file with xmllint (DTD validation, no output
+# of the parsed tree). The stylesheet is a prerequisite only; it is not passed
+# to xmllint.
+test: $(TARGET_DIR)/dnb.i5.xml xslt/epub2i5.xsl
+	xmllint --noout --valid $(firstword $^)
+
+# Unpack one EPUB into a directory of the same base name below $(BUILD_DIR).
+# $* is the stem, i.e. the EPUB file name without directory and extension.
+$(BUILD_DIR)/%: $(SRC_DIR)/%.epub
+	mkdir -p $(BUILD_DIR)/$*
+	echo "Converting $(SRC_DIR)/$*.epub to $(BUILD_DIR)/$*"
+	unzip -q -o $(SRC_DIR)/$*.epub -d $(BUILD_DIR)/$*
+
+# Transform one unpacked EPUB into an I5 document. The input handed to Saxon
+# is the content.opf found exactly one directory level below the unpacked
+# root (shell glob).
+$(TARGET_DIR)/%.i5.xml: $(BUILD_DIR)/% xslt/epub2i5.xsl
+	mkdir -p $(TARGET_DIR)
+	echo "Converting $(BUILD_DIR)/$* to $@"
+	java -jar lib/saxon9ee.jar -xsl:$(word 2,$^) $(BUILD_DIR)/$*/*/content.opf > $@
+
+# Shared annotation pipeline: dump the KorAP-XML zip as CoNLL-U, show progress
+# with pv, run the annotator command passed as $(1), and convert its output
+# back with conllu2korapxml into the target zip.
+annotate = korapxml2conllu $< | pv | $(1) | conllu2korapxml > $@
+
+# I5 document -> base zip (tei2korapxml writes the archive to stdout).
+%.zip: %.i5.xml
+	tei2korapxml -l warn -s -tk - < $< > $@
+
+# TreeTagger layer (docker image, German model).
+%.tree_tagger.zip: %.zip
+	$(call annotate,docker run --rm -i korap/conllu2treetagger -l german)
+
+# spaCy layer (docker image).
+%.spacy.zip: %.zip
+	$(call annotate,docker run --rm -i korap/conllu2spacy)
+
+# UDPipe layer via the REST client script in this repository.
+%.ud.zip: %.zip
+	$(call annotate,./scripts/udpipe2)
+
+# CMC layer.
+%.cmc.zip: %.zip
+	$(call annotate,conllu2cmc -s)
+
+# Convert the base zip plus its UDPipe and CMC layers with korapxml2krill.
+# Each prerequisite is passed with its own -i flag, in the listed order.
+# NOTE(review): -o receives $(basename $@), i.e. the target name without
+# ".tar"; confirm that the tool then produces $@ itself.
+%.krill.tar: %.zip %.ud.zip %.cmc.zip
+	korapxml2krill archive --quiet -w -z -cfg krill-kokokom.cfg --non-word-tokens --meta I5 $(foreach layer,$^,-i $(layer)) -o $(basename $@)
+
+# Unpack every Krill archive into a freshly created json/ directory.
+# The loop aborts on the first failing tar call; a bare `for ...; do ...; done`
+# would only report the status of the last iteration.
+# NOTE(review): the glob is resolved relative to the working directory, not
+# $(TARGET_DIR); confirm that is where the *.krill.tar files are expected.
+json: *.krill.tar
+	rm -rf json
+	mkdir -p json
+	for f in $^; do tar -C json -xf "$$f" || exit 1; done
+
+# Remove the unpack directory and the output directory.
+clean:
+	rm -rf $(BUILD_DIR)
+	rm -rf $(TARGET_DIR)
+

diff --git a/Readme.md b/Readme.md
index 677f60a..20fdd7d 100644
--- a/Readme.md
+++ b/Readme.md

@@ -1,7 +1,30 @@
-# EPub to TEI I5 conversion
+# EPub to KorAP (via TEI I5) conversion
 
+## Run
+
+### To generate I5 corpus
+
+```bash
+make target/dnb.i5.xml
+```
+
+### To generate the KorAP-XML ZIP
+
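+Prerequisite: [KorAP-XML-TEI](https://github.com/KorAP/KorAP-XML-TEI), which provides `tei2korapxml`. The annotation targets below additionally need [KorAP-XML-CoNLL-U](https://github.com/KorAP/KorAP-XML-CoNLL-U).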
+
+```bash
+make target/dnb.zip
+```
+
+### To generate Annotations
+
+```bash
+make target/dnb.spacy.zip target/dnb.tree_tagger.zip
+```
 ## News
 
+* 2024-03-16: first working pipeline for EPub ⮕ TEI I5 ⮕ KorAP-XML ⮕ (UDPipe+TreeTagger+Spacy) ⮕ Krill ⮕ KorAP-JSON
+
 * 2024-03-15: DNB test data added
 
 * 2024-03-08: example EPub and I5 added from DeReKo KJL corpus: *Christiane F. ; Kai Hermann ; Horst Rieck: Wir Kinder vom Bahnhof Zoo* in the folder [`test/resources/`](./test/resources/)  – do not distribute (copyrighted data)

diff --git a/lib/saxon-license.lic b/lib/saxon-license.lic
new file mode 100644
index 0000000..c104d43
--- /dev/null
+++ b/lib/saxon-license.lic

@@ -0,0 +1,18 @@
+Licensor=Saxonica
+Licensee=Marc Kupietz
+Company=Institut für Deutsche Sprache
+Email=kupietz@ids-mannheim.de
+Edition=EE
+SAT=yes
+SAQ=yes
+SAV=yes
+Issued=2019-01-10
+Series=V
+Serial=V007508
+User=P0001
+Evaluation=no
+Expiration=never
+UpgradeDays=366
+MaintenanceDays=366
+
+Signature=302C02142962C7B427BE5221DF20508FB60CE956B3C762E7021415F7021BBF9EC2EF562E40C7651DE2120891BE73
\ No newline at end of file

diff --git a/lib/saxon9ee.jar b/lib/saxon9ee.jar
new file mode 100644
index 0000000..6035609
--- /dev/null
+++ b/lib/saxon9ee.jar
Binary files differ

diff --git a/scripts/section_helper.sh b/scripts/section_helper.sh
new file mode 100644
index 0000000..ddefe0d
--- /dev/null
+++ b/scripts/section_helper.sh

@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Reference: https://docs.gitlab.com/ee/ci/jobs/#custom-collapsible-sections
+
+#
+# Prints the marker line that opens a collapsible section (see the reference
+# above).
+#   $1 - new section id (pass the same id to end_section)
+#   $2 - heading/title of the section
+# The line carries the current Unix time and [collapsed=true]; the title is
+# wrapped in ANSI colour escapes.
+#
+function start_section() {
+  # local: this file only defines functions and is presumably sourced, so
+  # plain assignments would overwrite id/title variables of the caller.
+  local id=$1
+  local title=$2
+  echo -e "\e[0Ksection_start:$(date +%s):${id}[collapsed=true]\r\e[0K\e[36;1m${title}\e[0m"
+}
+
+#
+# Prints the marker line that closes a collapsible section.
+#   $1 - the unique id of the section that should end
+# The line carries the current Unix time.
+#
+function end_section() {
+  # local: do not overwrite an id variable of the calling script
+  local id=$1
+  echo -e "\e[0Ksection_end:$(date +%s):${id}\r\e[0K"
+}

diff --git a/scripts/udpipe2 b/scripts/udpipe2
new file mode 100755
index 0000000..ac7eec4
--- /dev/null
+++ b/scripts/udpipe2

@@ -0,0 +1,55 @@
+#!/bin/bash
+
+usage() { echo "Usage: $0 [-r | -s <server>] [-m <model>] [-l] < c.conllu > c.ud.conllu" 1>&2; exit 1; }
+
+LOCAL_SERVER=http://compute.ids-mannheim.de:8001
+LINDAT_SERVER=https://lindat.mff.cuni.cz/services/udpipe/api
+
+server=${LOCAL_SERVER}
+model=de_hdt
+
+udpipe_server_is_operational () {
+  [ $(curl -s -o /dev/null -w "%{http_code}" ${1}/models) -eq 200 ]
+}
+
+if ! udpipe_server_is_operational $server; then 
+    echo "WARING: Local server $server is not responding, defaulting to LINDAT server." >&2
+    server=$LINDAT_SERVER
+fi
+
+while getopts "s:m:rhl" o; do
+    case "${o}" in
+        r)
+            server=${LINDAT_SERVER}
+            ;;
+        s)
+            server=${OPTARG}
+            ;;
+        m)
+            model=${OPTARG}
+            ;;
+        l)
+            curl ${server}/models
+            exit 0
+            ;;
+        *)
+            usage
+            ;;
+    esac
+done
+shift $((OPTIND-1))
+
+if ! udpipe_server_is_operational $server; then 
+    echo "ERROR: Udpipe server $server is not operational." >&2
+    exit -1
+fi
+
+idx=1
+while [[ $idx -gt 0 ]]; do
+    idx=0
+    while IFS= read -r line && ( [[ $idx -lt 120000 ]] || ! [[ -z "$line" ]] ); do
+        $(( idx++ )) 2> /dev/null
+        echo "$line"
+#	echo "$line" >&2
+    done  > >(curl --silent -F data=@- -F model=${model} -F tagger= -F parser= ${server}/process | jq -j .result )
+done

diff --git a/xslt/epub2i5.xsl b/xslt/epub2i5.xsl
new file mode 100644
index 0000000..c5f5c95
--- /dev/null
+++ b/xslt/epub2i5.xsl

@@ -0,0 +1,405 @@
+<xsl:stylesheet version="3.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+                xmlns:xs="http://www.w3.org/2001/XMLSchema"
+                xmlns:opf="http://www.idpf.org/2007/opf"
+                xmlns:dc="http://purl.org/dc/elements/1.1/"
+                xmlns:ids="http://www.ids-mannheim.de/ids"
+                xmlns:hlu="http://www.ids-mannheim.de/hlu"
+                xmlns:saxon="http://saxon.sf.net/"
+                xmlns:xhtml="http://www.w3.org/1999/xhtml"
+                exclude-result-prefixes="xs opf dc ids hlu saxon xhtml">
+
+    <xsl:output method="xml" indent="yes" omit-xml-declaration="yes" saxon:line-length="1000"/>
+    <xsl:strip-space elements="*"/>
+
+    <!-- Empty placeholders; $ev is only tested by $ent and $x only by $txtart. -->
+    <xsl:variable name="ev"/>
+    <xsl:variable name="x"/>
+
+    <!-- ISBN cut out of the URI of the source document: the regex needs a run
+         of 13 or more digits and captures 13 of them. If the URI holds no such
+         run, replace() returns the URI unchanged. -->
+    <xsl:variable name="isbn" as="xs:string" select="replace(document-uri(), '.*([0-9]{13,}).*' , '$1')"/>
+
+
+    <!-- Copy of the document returned by the DNB SRU service for the query
+         NUM=$isbn with record schema oai_dc. The request is made while the
+         stylesheet runs, so the transformation needs network access. -->
+    <xsl:variable name="dnbBookdata">
+        <xsl:copy-of select="doc(concat('https://services.dnb.de/sru/dnb?version=1.1&amp;operation=searchRetrieve&amp;query=NUM%3D', $isbn, '&amp;recordSchema=oai_dc'))"/>
+    </xsl:variable>
+
+    <!-- All dc:creator values joined by ' ; ', each with bracketed additions
+         such as "[Hrsg.]" removed. replace() is applied per creator through a
+         path step: fn:replace accepts a single string only, so handing it the
+         whole node sequence fails for records with more than one creator. -->
+    <xsl:variable name="autor"
+        select="string-join($dnbBookdata//dc:creator/replace(., ' *\[[^\]]*\]', ''), ' ; ')"/>
+
+    <!-- $autor run through hlu:reversedAuthors, with every comma removed. -->
+    <xsl:variable name="straight_autor" select="replace(hlu:reversedAuthors($autor), ',', '')"/>
+
+    <!-- Empty placeholders; within this stylesheet only $ent_known is
+         referenced (by $ent). -->
+    <xsl:variable name="ina"/>
+    <xsl:variable name="_corpus"/>
+    <xsl:variable name="ent_known"/>
+
+
+    <!-- (added HLU 2012-02-09) $ent_known if it is non-empty, else $ev if it is
+         non-empty, else $j. -->
+    <xsl:variable name="ent">
+        <xsl:value-of select="if ($ent_known) then $ent_known else if ($ev) then $ev else $j"/>
+    </xsl:variable>
+
+    <!-- Main title: dc:title up to the first '/', and of that the part before
+         the first ':' if there is one. The part from the first '/' on is
+         removed with replace() instead of substring-before(), which returned
+         an empty title for records without '/'. The colon test looks only at
+         the part before the '/', so a colon further right no longer empties
+         the title. -->
+    <xsl:variable name="titel">
+        <xsl:variable name="titleProper" select="replace($dnbBookdata//dc:title, '\s*/.*$', '', 's')"/>
+        <xsl:choose>
+            <xsl:when test="contains($titleProper, ':')">
+                <xsl:value-of select="normalize-space(substring-before($titleProper, ':'))"/>
+            </xsl:when>
+            <xsl:otherwise>
+                <xsl:value-of select="normalize-space($titleProper)"/>
+            </xsl:otherwise>
+        </xsl:choose>
+    </xsl:variable>
+
+    <!-- Place of publication (used for pubPlace): the part of dc:publisher
+         before the first ':'; without a colon the whole value. Whitespace is
+         normalised in both cases. -->
+    <xsl:variable name="erscheinungsort">
+        <xsl:variable name="publisher" select="$dnbBookdata//dc:publisher"/>
+        <xsl:value-of select="normalize-space(if (contains($publisher, ':')) then substring-before($publisher, ':') else $publisher)"/>
+    </xsl:variable>
+
+    <!-- The part of dc:publisher after the first ':'; without a colon the
+         whole value. Whitespace is normalised in both cases. -->
+    <xsl:variable name="verlag">
+        <xsl:variable name="publisher" select="$dnbBookdata//dc:publisher"/>
+        <xsl:value-of select="normalize-space(if (contains($publisher, ':')) then substring-after($publisher, ':') else $publisher)"/>
+    </xsl:variable>
+
+    <!-- dc:date itself when it consists of exactly four digits, otherwise the
+         part of dc:date before its first '-'. -->
+    <xsl:variable name="erscheinungsjahr">
+        <xsl:variable name="date" select="$dnbBookdata//dc:date"/>
+        <xsl:value-of select="if (matches($date, '^[0-9]{4}$')) then $date else substring-before($date, '-')"/>
+    </xsl:variable>
+
+    <!-- Subtitle: what follows the first ':' within the part of dc:title that
+         precedes the first '/'. The '/' part is cut off with replace() because
+         substring-before() yields an empty string, and with it an empty
+         subtitle, for titles that contain no '/'. -->
+    <xsl:variable name="untertitel"
+        select="normalize-space(substring-after(replace($dnbBookdata//dc:title, '\s*/.*$', '', 's'), ':'))"/>
+
+    <!-- The dc:creator values ending in "[Hrsg.]", joined by ' ; ' and with
+         that marker removed; '.' when the record has no such creator. -->
+    <xsl:variable name="herausgeber">
+        <xsl:variable name="editors" select="$dnbBookdata//dc:creator[ends-with(.,'[Hrsg.]')]"/>
+        <xsl:value-of select="if ($editors) then replace(string-join($editors, ' ; '),'\s?\[Hrsg.\]','') else '.'"/>
+    </xsl:variable>
+
+    <!-- $herausgeber run through hlu:reversedAuthors, with every comma removed. -->
+    <xsl:variable name="straight_herausgeber"
+        select="replace(hlu:reversedAuthors($herausgeber), ',', '')"/>
+
+    <!-- The dc:date node(s) of the record as delivered (not reduced to a year). -->
+    <xsl:variable name="j" select="$dnbBookdata//dc:date"/>
+
+    <!-- for BOT+s: dc:format with every occurrence of "S." removed -->
+    <xsl:variable name="seiten" select="replace($dnbBookdata//dc:format,'S\.','')"/>
+
+    <!-- for BOT+b: the volume designation ("Band" or "Bd." followed by up to
+         three digits) found in the title, or '.' if the title has none.
+         Test and extraction now work on the same string; before, the test
+         looked at the whole record while the extraction only read dc:title,
+         so a hit outside the title produced an empty value. -->
+    <xsl:variable name="_b">
+        <xsl:variable name="regexp1" select="'(Band|Bd\.)\s*([0-9]?[0-9]?[0-9])'"/>
+        <xsl:variable name="title" select="string-join($dnbBookdata//dc:title, ' ')"/>
+        <xsl:choose>
+            <xsl:when test="matches($title, $regexp1)">
+                <xsl:analyze-string select="$title" regex="{$regexp1}">
+                    <xsl:matching-substring>
+                        <xsl:value-of select="."/>
+                    </xsl:matching-substring>
+                </xsl:analyze-string>
+            </xsl:when>
+            <xsl:otherwise>
+                <xsl:value-of select="'.'"/>
+            </xsl:otherwise>
+        </xsl:choose>
+    </xsl:variable>
+
+    <!-- for BOT+x: text type. Order of precedence: $x in square brackets; a
+         genre word found in the subtitle (together with a word directly
+         attached in front of it), in square brackets; otherwise "Roman".
+         The alternative for "Erzählung" had lost its umlaut and could not
+         match that word.
+         NOTE(review): the fallback is emitted without square brackets, unlike
+         the other two branches; confirm whether that is intended. -->
+    <xsl:variable name="txtart">
+        <xsl:choose>
+            <xsl:when test="$x">
+                <xsl:value-of select="concat('[', $x, ']')"/>
+            </xsl:when>
+            <xsl:when
+                test="matches($untertitel, '([Rr]oman|[Ee]rzählung(en)?|[Aa]nthologie|[Gg]eschichte(n)?|[Nn]ovelle)')">
+                <xsl:value-of
+                    select="concat('[', replace(replace($untertitel, '.*?(((^|\P{L})\p{L}+)?([Rr]oman|[Ee]rzählung(en)?|[Aa]nthologie|[Gg]eschichte(n)?|[Nn]ovelle)).*', '$1'), '\P{L}*(.+)', '$1'), ']')"
+                    />
+            </xsl:when>
+            <xsl:otherwise>
+                <xsl:value-of>Roman</xsl:value-of>
+            </xsl:otherwise>
+        </xsl:choose>
+    </xsl:variable>
+
+
+
+    <!-- for BOTd: document title line. Built as "<straight_autor>: " (only if
+         $autor is not empty), then $titel, $txtart and $j in parentheses,
+         separated by ', '. -->
+    <xsl:variable name="dok"
+        select="concat((if(string-length($autor) &gt; 0) then concat($straight_autor, ': ') else ''), $titel, ', ', $txtart, ', (', $j, ')')"/>
+
+    <!-- END variables derived from sru request to dnb archive -->
+
+
+    <!-- Fixed corpus sigle, first component of $sigle. -->
+    <xsl:variable name="corpus_sigle" select="'DNB'"/>
+
+    <!-- for BOTD: -->
+    <!-- The document sigle must be unique in combination with the corpus sigle
+         (e.g. DIV for loz-div and loz-div-pub).
+         It consists of two author initials followed by the initial of the
+         first title word that starts with A-Z and is not in the stop list:
+         - several authors: initials of the first two authors' last names
+         - one author: initials of last name and first name
+         Fixes: "Zurück" in the stop list had lost its umlaut; the second
+         author is now taken with tokenize(), because the nested
+         substring-before(..., ';') returned an empty string when there were
+         exactly two authors. -->
+    <xsl:variable name="doc_sigle">
+        <xsl:variable name="firstContentWordTitleInitial">
+            <xsl:variable name="helper">
+                <xsl:analyze-string select="$titel" regex="\w+">
+                    <xsl:matching-substring>
+                        <xsl:choose>
+                            <xsl:when
+                                test="matches(.,'^[A-Z]') and not(matches(.,'^(Der|Die|Das|Des|Ein|Eine|Eines|Einmal|Von|Mit|Zu|Zurück)$'))">
+                                <!-- TODO: add further function words -->
+                                <xsl:sequence select="."/>
+                            </xsl:when>
+                            <xsl:otherwise/>
+                        </xsl:choose>
+                    </xsl:matching-substring>
+                </xsl:analyze-string>
+            </xsl:variable>
+            <xsl:value-of
+                select="upper-case(substring(normalize-space(replace($helper,'\s+.+$','')),1,1))"/>
+            <!-- longest match of .+  -->
+        </xsl:variable>
+        <xsl:choose>
+            <xsl:when test="contains($autor,';')">
+                <xsl:variable name="lastname_aut1"
+                    select="upper-case(substring(normalize-space(substring-before(substring-before($autor,';'),',')), 1, 1))"/>
+                <xsl:variable name="lastname_aut2"
+                    select="upper-case(substring(normalize-space(substring-before(tokenize($autor, ';')[2], ',')), 1, 1))"/>
+                <xsl:value-of select="concat($lastname_aut1, $lastname_aut2)"/>
+            </xsl:when>
+            <xsl:otherwise>
+                <xsl:variable name="lastname_aut1"
+                    select="upper-case(substring(normalize-space(substring-before($autor,',')),1,1))"/>
+                <xsl:variable name="firstname_aut1"
+                    select="upper-case(substring(normalize-space(substring-after($autor,',')),1,1))"/>
+                <xsl:value-of select="concat($lastname_aut1, $firstname_aut1)"/>
+            </xsl:otherwise>
+        </xsl:choose>
+        <xsl:value-of select="$firstContentWordTitleInitial"/>
+    </xsl:variable>
+
+
+    <!-- Five characters of $isbn starting at position 8. -->
+    <xsl:variable name="text_sigle" select="substring($isbn, 8, 5)"/>
+    <!-- Full text sigle: <corpus_sigle>/<doc_sigle>.<text_sigle> -->
+    <xsl:variable name="sigle" select="concat($corpus_sigle, '/', $doc_sigle, '.', $text_sigle)"/>
+
+    <!-- for BOT+xy: (?) URI of the source document, "ISBN:" plus $isbn, and all
+         dc:identifier values joined without separator; the three parts are
+         separated by "; ". -->
+    <xsl:variable name="xyref">
+        <xsl:value-of select="document-uri(.)"/>
+        <xsl:text>; </xsl:text>
+        <xsl:text>ISBN:</xsl:text>
+        <xsl:value-of select="$isbn"/>
+        <xsl:text>; </xsl:text>
+        <xsl:value-of select="string-join($dnbBookdata//dc:identifier)"/>
+    </xsl:variable>
+
+
+    <!-- Entry point: wraps the book in idsDoc/idsText, writes the document and
+         text headers, and fills the body from the (X)HTML files listed in the
+         OPF manifest.
+         Changes against the first version: dokumentSigle, d.title, t.title and
+         the short reference were literal values of another book (KJL/HRK) and
+         are now computed from the variables above; publisher is filled from
+         $verlag instead of the editor variable $herausgeber; the year comes
+         from $erscheinungsjahr instead of the raw dc:date.
+         NOTE(review): textType/textTypeRef are still the fixed value
+         "Jugendliteratur", and the exact wording of t.title and of the short
+         reference follows the former placeholders; confirm both. -->
+    <xsl:template match="/">
+        <idsDoc TEIform="TEI.2" type="text" version="1.0">
+            <idsHeader TEIform="teiHeader" pattern="text" status="new" type="document" version="1.1">
+                <fileDesc>
+                    <titleStmt>
+                        <dokumentSigle><xsl:value-of select="concat($corpus_sigle, '/', $doc_sigle)"/></dokumentSigle>
+                        <d.title><xsl:value-of select="$dok"/></d.title>
+                    </titleStmt>
+                    <publicationStmt>
+                        <distributor/>
+                        <pubAddress/>
+                        <availability region="world" status="unknown">QAO-NC</availability>
+                        <pubDate/>
+                    </publicationStmt>
+                    <sourceDesc>
+                        <biblStruct>
+                            <monogr>
+                                <h.title type="main"/>
+                                <imprint/>
+                            </monogr>
+                        </biblStruct>
+                    </sourceDesc>
+                </fileDesc>
+            </idsHeader>
+            <idsText version="1.0">
+                <idsHeader TEIform="teiHeader" pattern="text" status="new" type="text" version="1.1">
+                    <fileDesc>
+                        <titleStmt>
+                            <textSigle><xsl:sequence select="$sigle"/></textSigle>
+                            <t.title assemblage="regular"><xsl:value-of select="concat($sigle, ' ', $autor, ': ', $titel, ', ', $txtart, '. - ', $erscheinungsort, ', ', $erscheinungsjahr)"/></t.title>
+                        </titleStmt>
+                        <publicationStmt>
+                            <distributor/>
+                            <pubAddress/>
+                            <availability region="world" status="unknown">QAO-NC</availability>
+                            <pubDate/>
+                        </publicationStmt>
+                        <sourceDesc>
+                            <biblStruct>
+                                <monogr>
+                                    <h.title type="main"><xsl:value-of select="$titel"/></h.title>
+                                    <h.title type="sub"><xsl:value-of select="$untertitel"/></h.title>
+                                    <h.author><xsl:value-of select="$autor"/></h.author>
+                                    <editor/>
+                                    <edition>
+                                        <further/>
+                                        <kind>E-Book-Ausgabe</kind>
+                                        <appearance>EPUB-Datei</appearance>
+                                    </edition>
+                                    <imprint>
+                                        <publisher><xsl:value-of select="$verlag"/></publisher>
+                                        <pubDate type="year"><xsl:value-of select="$erscheinungsjahr"/></pubDate>
+                                        <pubDate type="month"/>
+                                        <pubDate type="day"/>
+                                        <pubPlace key="DE"><xsl:value-of select="$erscheinungsort"/></pubPlace>
+                                    </imprint>
+                                    <biblScope type="subsume"/>
+                                    <biblScope type="pp"/>
+                                    <biblScope type="vol"/>
+                                    <biblScope type="volume-title"/>
+                                </monogr>
+                            </biblStruct>
+                            <reference assemblage="regular" type="complete"><xsl:value-of select="concat($sigle, ' ', $autor, ': ', $titel, '. ', $erscheinungsort, ': ', $verlag, ', ', $erscheinungsjahr)"/></reference>
+                            <!-- short form: last names only (the part of each author before the comma) -->
+                            <reference assemblage="regular" type="short"><xsl:value-of select="concat($sigle, ' ', string-join(for $a in tokenize($autor, ' ; ') return normalize-space(if (contains($a, ',')) then substring-before($a, ',') else $a), ' ; '), ': ', $titel, ', ', $erscheinungsjahr)"/></reference>
+                        </sourceDesc>
+                    </fileDesc>
+                    <profileDesc>
+                        <creation>
+                            <creatDate><xsl:value-of select="$j"/></creatDate>
+                        </creation>
+                        <textClass/>
+                        <textDesc>
+                            <textType>Jugendliteratur</textType>
+                            <textTypeRef>Jugendliteratur</textTypeRef>
+                            <textDomain/>
+                        </textDesc>
+                    </profileDesc>
+                </idsHeader>
+                <text>
+                   <body>
+                    <!-- Call the template for each link in the TOC 
+                         <xsl:apply-templates select="//xhtml:ol[@class='toc']/xhtml:li/xhtml:a" mode="collect"/> -->
+                    <xsl:apply-templates select="//opf:package/opf:manifest/opf:item[matches(@href, '\.x?html$') and not(matches(@href, '(cover|toc|copyright|feedback).*'))]" mode="collect"/>
+                    </body>
+                </text>
+            </idsText>
+        </idsDoc>
+    </xsl:template>
+
+    <!-- One manifest item: report it via xsl:message, load the file its @href
+         points to (resolved against the base URI of the item) and process the
+         XHTML body found there. -->
+    <xsl:template match="opf:item" mode="collect">
+        <xsl:message>
+            <xsl:value-of select="concat('converting: ', @href, ' ', $isbn)"/>
+        </xsl:message>
+        <xsl:apply-templates select="doc(resolve-uri(@href, base-uri()))/xhtml:html/xhtml:body"/>
+    </xsl:template>
+
+    <!-- Every XHTML body becomes one division of type "chapter". -->
+    <xsl:template match="xhtml:body">
+        <div type="chapter"><xsl:apply-templates/></div>
+    </xsl:template>
+
+    <!-- title and h1 both map to a plain head element. -->
+    <xsl:template match="xhtml:title | xhtml:h1">
+        <head><xsl:apply-templates/></head>
+    </xsl:template>
+
+    <!-- h2 and h3 map to a head element of type "sub". -->
+    <xsl:template match="xhtml:h3 | xhtml:h2">
+        <head type="sub"><xsl:apply-templates/></head>
+    </xsl:template>
+
+
+    <!-- Spans whose class is exactly italic, bold, sub or sup become a hi
+         element carrying that class name as its rend value. -->
+    <xsl:template match="xhtml:span[@class = ('italic', 'bold', 'sub', 'sup')]">
+        <hi rend="{@class}"><xsl:apply-templates/></hi>
+    </xsl:template>
+
+    <!-- XHTML div becomes a division of type "section". -->
+    <xsl:template match="xhtml:div">
+        <div type="section">
+            <xsl:apply-templates/>
+        </div>
+    </xsl:template>
+
+    <!-- Paragraphs are kept as p. -->
+    <xsl:template match="xhtml:p">
+        <p>
+            <xsl:apply-templates/>
+        </p>
+    </xsl:template>
+
+    <!-- Images produce no output; the gap element below is disabled. -->
+    <xsl:template match="xhtml:img">
+        <!-- <gap reason="image"/>  -->
+    </xsl:template>
+
+    <!-- Fallback for every other XHTML element: report its name and its
+         non-blank attributes (as name:value pairs joined by '_') via
+         xsl:message, drop the element itself and continue with its children. -->
+    <xsl:template match="xhtml:*">
+        <xsl:message>
+            <xsl:text>unhandled element: </xsl:text><xsl:value-of select="concat(name(), ' ', string-join(./@*[normalize-space(.) != '']/concat(name(), ':', ., ' '), '_'))"/>
+        </xsl:message>
+        <xsl:apply-templates/>
+    </xsl:template>
+
+    <!-- Rewrites "A B" as "B, A", splitting at the last space. A value that
+         contains ';' is cut at its first ' ; ' and both parts are handled
+         recursively and joined with ' ; ' again. -->
+    <xsl:function name="ids:reversedAuthors">
+        <xsl:param name="s" />
+        <xsl:variable name="head" select="substring-before($s, ' ; ')"/>
+        <xsl:variable name="tail" select="substring-after($s, ' ; ')"/>
+        <xsl:value-of
+            select="
+                if (contains($s, ';')) then
+                    concat(ids:reversedAuthors($head), ' ; ', ids:reversedAuthors($tail))
+                else
+                    replace($s, '(.+) (.+)', '$2, $1')"
+            />
+    </xsl:function>
+
+    <!-- Rewrites "Last,First" as "First, Last", splitting at the last comma. A
+         value that contains ';' is cut at its first ' ; ' and both parts are
+         handled recursively.
+         The recursion now calls this function. It used to call
+         ids:reversedAuthors, which splits at the last space instead of the
+         comma and therefore mis-ordered names with more than one given name
+         as soon as a record had several authors. -->
+    <xsl:function name="hlu:reversedAuthors">
+        <xsl:param name="s"/>
+        <xsl:value-of
+            select="
+                if (matches($s, ';')) then
+                    concat(hlu:reversedAuthors(substring-before($s, ' ; ')), ' ; ', hlu:reversedAuthors(substring-after($s, ' ; ')))
+                else
+                    replace($s, '(.+),(.+)', '$2, $1')"
+            />
+    </xsl:function>
+
+</xsl:stylesheet>

diff --git a/xslt/idsCorpus-template.xml b/xslt/idsCorpus-template.xml
new file mode 100644
index 0000000..7ff9f0f
--- /dev/null
+++ b/xslt/idsCorpus-template.xml

@@ -0,0 +1,242 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd">
+<!-- Corpus header template. The Makefile writes every line except the last
+     one in front of the generated documents and the last line behind them,
+     so the closing idsCorpus tag has to stay alone on the final line. -->
+<idsCorpus xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" TEIform="teiCorpus.2" version="1.0">
+  <idsHeader TEIform="teiHeader" pattern="allesaußerZtg/Zschr" status="new" type="corpus" version="1.1">
+    <fileDesc>
+      <titleStmt>
+        <korpusSigle>DNB</korpusSigle>
+        <c.title>Deutschsprachige Belletristik</c.title>
+      </titleStmt>
+      <publicationStmt>
+        <distributor>		Institut für Deutsche Sprache		</distributor>
+        <pubAddress>		Postfach 10 16 21, D-68016 Mannheim	</pubAddress>
+        <telephone>		+49 (0)621 1581 0			</telephone>
+        <availability region="world" status="unknown">QAO-NC</availability>
+        <pubDate/>
+      </publicationStmt>
+      <sourceDesc>
+        <biblStruct>
+          <monogr>
+            <h.title type="main"/>
+            <imprint/>
+          </monogr>
+        </biblStruct>
+      </sourceDesc>
+    </fileDesc>
+    <encodingDesc>
+      <projectDesc/>
+      <samplingDecl/>
+      <editorialDecl>
+        <transduction>korap4dnb</transduction>
+        <pagination type="no"/>
+      </editorialDecl>
+      <classDecl>
+        <taxonomy id="topic">
+          <h.bibl>Thementaxonomie (siehe http://www.ids-mannheim.de/kl/projekte/methoden/te.html)</h.bibl>
+          <category id="topic.fiktion">
+            <catDesc>Fiktion</catDesc>
+            <category id="topic.fiktion.vermischtes">
+              <catDesc>Fiktion:Vermischtes</catDesc>
+            </category>
+          </category>
+          <category id="topic.freizeit-unterhaltung">
+            <catDesc>Freizeit_Unterhaltung</catDesc>
+            <category id="topic.freizeit-unterhaltung.reisen">
+              <catDesc>Freizeit_Unterhaltung:Reisen</catDesc>
+            </category>
+            <category id="topic.freizeit-unterhaltung.rundfunk">
+              <catDesc>Freizeit_Unterhaltung:Rundfunk</catDesc>
+            </category>
+            <category id="topic.freizeit-unterhaltung.vereine-veranstaltungen">
+              <catDesc>Freizeit_Unterhaltung:Vereine_Veranstaltungen</catDesc>
+            </category>
+          </category>
+          <category id="topic.gesundheit-ernaehrung">
+            <catDesc>Gesundheit_Ernaehrung</catDesc>
+            <category id="topic.gesundheit-ernaehrung.ernaehrung">
+              <catDesc>Gesundheit_Ernaehrung:Ernaehrung</catDesc>
+            </category>
+            <category id="topic.gesundheit-ernaehrung.gesundheit">
+              <catDesc>Gesundheit_Ernaehrung:Gesundheit</catDesc>
+            </category>
+          </category>
+          <category id="topic.kultur">
+            <catDesc>Kultur</catDesc>
+            <category id="topic.kultur.bildende-kunst">
+              <catDesc>Kultur:Bildende Kunst</catDesc>
+            </category>
+            <category id="topic.kultur.darstellende-kunst">
+              <catDesc>Kultur:Darstellende Kunst</catDesc>
+            </category>
+            <category id="topic.kultur.film">
+              <catDesc>Kultur:Film</catDesc>
+            </category>
+            <category id="topic.kultur.literatur">
+              <catDesc>Kultur:Literatur</catDesc>
+            </category>
+            <category id="topic.kultur.mode">
+              <catDesc>Kultur:Mode</catDesc>
+            </category>
+            <category id="topic.kultur.musik">
+              <catDesc>Kultur:Musik</catDesc>
+            </category>
+          </category>
+          <category id="topic.natur-umwelt">
+            <catDesc>Natur_Umwelt</catDesc>
+            <category id="topic.natur-umwelt.garten">
+              <catDesc>Natur_Umwelt:Garten</catDesc>
+            </category>
+            <category id="topic.natur-umwelt.tiere">
+              <catDesc>Natur_Umwelt:Tiere</catDesc>
+            </category>
+            <category id="topic.natur-umwelt.wetter-klima">
+              <catDesc>Natur_Umwelt:Wetter_Klima</catDesc>
+            </category>
+          </category>
+          <category id="topic.politik">
+            <catDesc>Politik</catDesc>
+            <category id="topic.politik.ausland">
+              <catDesc>Politik:Ausland</catDesc>
+            </category>
+            <category id="topic.politik.inland">
+              <catDesc>Politik:Inland</catDesc>
+            </category>
+            <category id="topic.politik.kommunalpolitik">
+              <catDesc>Politik:Kommunalpolitik</catDesc>
+            </category>
+          </category>
+          <category id="topic.rest">
+            <catDesc>Rest</catDesc>
+            <category id="topic.rest.boersenkurse">
+              <catDesc>Rest:boersenkurse</catDesc>
+            </category>
+            <category id="topic.rest.geburt-tod-heirat">
+              <catDesc>Rest:geburt_tod_heirat</catDesc>
+            </category>
+            <category id="topic.rest.impressum">
+              <catDesc>Rest:impressum</catDesc>
+            </category>
+            <category id="topic.rest.inhaltsverzeichnisse">
+              <catDesc>Rest:inhaltsverzeichnisse</catDesc>
+            </category>
+            <category id="topic.rest.ligatabellen">
+              <catDesc>Rest:ligatabellen</catDesc>
+            </category>
+            <category id="topic.rest.tabellen">
+              <catDesc>Rest:tabellen</catDesc>
+            </category>
+            <category id="topic.rest.veranstaltungshinweise">
+              <catDesc>Rest:veranstaltungshinweise</catDesc>
+            </category>
+          </category>
+          <category id="topic.sport">
+            <catDesc>Sport</catDesc>
+            <category id="topic.sport.ballsport">
+              <catDesc>Sport:Ballsport</catDesc>
+            </category>
+            <category id="topic.sport.fussball">
+              <catDesc>Sport:Fussball</catDesc>
+            </category>
+            <category id="topic.sport.motorsport">
+              <catDesc>Sport:Motorsport</catDesc>
+            </category>
+            <category id="topic.sport.radsport">
+              <catDesc>Sport:Radsport</catDesc>
+            </category>
+            <category id="topic.sport.tennis">
+              <catDesc>Sport:Tennis</catDesc>
+            </category>
+            <category id="topic.sport.vermischtes">
+              <catDesc>Sport:Vermischtes</catDesc>
+            </category>
+            <category id="topic.sport.wintersport">
+              <catDesc>Sport:Wintersport</catDesc>
+            </category>
+          </category>
+          <category id="topic.staat-gesellschaft">
+            <catDesc>Staat_Gesellschaft</catDesc>
+            <category id="topic.staat-gesellschaft.arbeit-und-beruf">
+              <catDesc>Staat_Gesellschaft:Arbeit_und_Beruf</catDesc>
+            </category>
+            <category id="topic.staat-gesellschaft.bildung">
+              <catDesc>Staat_Gesellschaft:Bildung</catDesc>
+            </category>
+            <category id="topic.staat-gesellschaft.biographien-interviews">
+              <catDesc>Staat_Gesellschaft:Biographien_Interviews</catDesc>
+            </category>
+            <category id="topic.staat-gesellschaft.drittes-reich-rechtsextremismus">
+              <catDesc>Staat_Gesellschaft:Drittes_Reich_Rechtsextremismus</catDesc>
+            </category>
+            <category id="topic.staat-gesellschaft.familie-geschlecht">
+              <catDesc>Staat_Gesellschaft:Familie_Geschlecht</catDesc>
+            </category>
+            <category id="topic.staat-gesellschaft.kirche">
+              <catDesc>Staat_Gesellschaft:Kirche</catDesc>
+            </category>
+            <category id="topic.staat-gesellschaft.recht">
+              <catDesc>Staat_Gesellschaft:Recht</catDesc>
+            </category>
+            <category id="topic.staat-gesellschaft.tod">
+              <catDesc>Staat_Gesellschaft:Tod</catDesc>
+            </category>
+            <category id="topic.staat-gesellschaft.verbrechen">
+              <catDesc>Staat_Gesellschaft:Verbrechen</catDesc>
+            </category>
+          </category>
+          <category id="topic.technik-industrie">
+            <catDesc>Technik_Industrie</catDesc>
+            <category id="topic.technik-industrie.edv-elektronik">
+              <catDesc>Technik_Industrie:EDV_Elektronik</catDesc>
+            </category>
+            <category id="topic.technik-industrie.kfz">
+              <catDesc>Technik_Industrie:Kfz</catDesc>
+            </category>
+            <category id="topic.technik-industrie.transport-verkehr">
+              <catDesc>Technik_Industrie:Transport_Verkehr</catDesc>
+            </category>
+            <category id="topic.technik-industrie.umweltschutz">
+              <catDesc>Technik_Industrie:Umweltschutz</catDesc>
+            </category>
+            <category id="topic.technik-industrie.unfaelle">
+              <catDesc>Technik_Industrie:Unfaelle</catDesc>
+            </category>
+          </category>
+          <category id="topic.wirtschaft-finanzen">
+            <catDesc>Wirtschaft_Finanzen</catDesc>
+            <category id="topic.wirtschaft-finanzen.banken">
+              <catDesc>Wirtschaft_Finanzen:Banken</catDesc>
+            </category>
+            <category id="topic.wirtschaft-finanzen.bilanzen">
+              <catDesc>Wirtschaft_Finanzen:Bilanzen</catDesc>
+            </category>
+            <category id="topic.wirtschaft-finanzen.oeffentliche-finanzen">
+              <catDesc>Wirtschaft_Finanzen:Oeffentliche_Finanzen</catDesc>
+            </category>
+            <category id="topic.wirtschaft-finanzen.sozialprodukt">
+              <catDesc>Wirtschaft_Finanzen:Sozialprodukt</catDesc>
+            </category>
+            <category id="topic.wirtschaft-finanzen.waehrung">
+              <catDesc>Wirtschaft_Finanzen:Waehrung</catDesc>
+            </category>
+          </category>
+          <category id="topic.wissenschaft">
+            <catDesc>Wissenschaft</catDesc>
+            <category id="topic.wissenschaft.populaerwissenschaft">
+              <catDesc>Wissenschaft:Populaerwissenschaft</catDesc>
+            </category>
+          </category>
+          <category id="topic.unklassifizierbar">
+            <catDesc>Text ist thematisch nicht klassifizierbar.</catDesc>
+          </category>
+        </taxonomy>
+      </classDecl>
+    </encodingDesc>
+    <profileDesc>
+      <langUsage>
+        <language id="de" usage="100">Deutsch</language>
+      </langUsage>
+      <textDesc/>
+    </profileDesc>
+  </idsHeader>
+
+</idsCorpus>
commit	1a42266168a011af3e8c592221c4c9f8118700d0	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Mar 16 09:34:10 2024 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Mar 16 09:34:10 2024 +0100
tree	920b02f23fcc68246d301a036ce41aa0fb4fe7ad
parent	7747c11f1152645b938c487f38e16b6b91b12d4f [diff]