Add topic domain classification in XSLT pass2
The classification is produced by a MALLET classifier trained on the old
training data in /vol/work/TE and is invoked by calling a Java function from XSLT.
Resolves #6
diff --git a/Makefile b/Makefile
index 9b38bed..674c69a 100644
--- a/Makefile
+++ b/Makefile
@@ -17,7 +17,7 @@
MAKE ?= make -j $(shell nproc)
KORAPXML2CONLLU_HEAP ?= $(shell echo "$$(($(MAX_THREADS) * 2100))")
KORAPXML2CONLLU ?= java -Xmx$(KORAPXML2CONLLU_HEAP)m -jar lib/korapxml2conllu.jar
-SAXON ?= java -cp lib/saxon9ee.jar:lib/xml-resolver-1.2.jar net.sf.saxon.Transform -expand:off -catalog:"lib/dtds/xhtml11/xhtmlcatalog.xml;lib/dtds/xhtml/dtd/xhtmlcatalog.xml"
+# Saxon command line. lib/textclassifier.jar is on the classpath for the
+# java:de.ids_mannheim.TextClassifier extension functions that xslt/pass2.xsl
+# calls (presumably the jar that provides that class; confirm).
+SAXON ?= java -cp lib/saxon9ee.jar:lib/xml-resolver-1.2.jar:lib/textclassifier.jar net.sf.saxon.Transform -expand:off -catalog:"lib/dtds/xhtml11/xhtmlcatalog.xml;lib/dtds/xhtml/dtd/xhtmlcatalog.xml"
.DELETE_ON_ERROR:
@@ -92,6 +92,10 @@
mkdir -p models
curl -sL -o $@ https://corpora.ids-mannheim.de/tools/$@
+# Download the topic-domain classifier model that xslt/pass2.xsl loads from
+# this path. curl's -f turns HTTP errors (404, 500, ...) into a non-zero exit
+# instead of saving the server's error page as the model; -S keeps the error
+# message visible despite -s.
+# NOTE(review): rules that run pass2.xsl presumably need this file as a
+# prerequisite; none is visible in this hunk, confirm.
+models/dereko_domains_s.classifier:
+	mkdir -p $(@D)
+	curl -sSfL -o $@ https://corpora.ids-mannheim.de/tools/$@
+
%.marmot-malt.zip: %.zip models/de.marmot models/german.mco
$(KORAPXML2CONLLU) -T $(MAX_THREADS) -t marmot:models/de.marmot -P malt:models/german.mco $< | conllu2korapxml > $@
diff --git a/lib/textclassifier.jar b/lib/textclassifier.jar
new file mode 100644
index 0000000..ef38851
--- /dev/null
+++ b/lib/textclassifier.jar
Binary files differ
diff --git a/xslt/pass2.xsl b/xslt/pass2.xsl
index 697c108..087fb78 100644
--- a/xslt/pass2.xsl
+++ b/xslt/pass2.xsl
@@ -1,14 +1,28 @@
<xsl:stylesheet version="3.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:saxon="http://saxon.sf.net/"
- exclude-result-prefixes="saxon">
+ xmlns:xs="http://www.w3.org/2001/XMLSchema"
+ xmlns:TextClassifier="java:de.ids_mannheim.TextClassifier"
+ exclude-result-prefixes="saxon xs TextClassifier">
<xsl:output method="xml" indent="yes" saxon:line-length="1000"
doctype-public="-//IDS//DTD IDS-I5 1.0//EN"
doctype-system="http://corpora.ids-mannheim.de/I5/DTD/i5.dtd"
/>
+ <!-- Classifier instance shared by all template invocations, created through the
+      java:de.ids_mannheim.TextClassifier extension binding. The model path is
+      relative; presumably it is resolved against the working directory of the
+      Saxon process (confirm). -->
+ <xsl:variable name="domainClassifier" select="TextClassifier:new('models/dereko_domains_s.classifier')"/>
+
<xsl:mode on-no-match="shallow-copy"/>
+ <!-- Replaces each textClass element with a new one holding the classifier's
+      topic-domain result. The Java call returns a ';'-separated string: fields
+      1 and 2 fill the first catRef, fields 3 and 4 the optional second one.
+      NOTE(review): field 3 is compared against a threshold, so it looks like a
+      score for the second domain; confirm against TextClassifier.
+      NOTE(review): ../../../text assumes the text element is a child of the
+      third ancestor of textClass; confirm against the I5 structure. -->
+ <xsl:template match="textClass">
+   <xsl:variable name="classification" select="tokenize(TextClassifier:topicDomainsFromText($domainClassifier, ../../../text), ';')"/>
+   <textClass>
+     <catRef n="{$classification[1]}" target="{$classification[2]}" scheme="topic"/>
+     <!-- Cast with xs:double, not xs:decimal: xs:decimal rejects exponent
+          notation such as "1.0E-8" (the form Java's Double.toString uses for
+          small values) and would abort the transformation with a cast error. -->
+     <xsl:if test="xs:double($classification[3]) > 0.0000001">
+       <catRef n="{$classification[3]}" target="{$classification[4]}" scheme="topic"/>
+     </xsl:if>
+   </textClass>
+ </xsl:template>
+
<xsl:template match="p[not(normalize-space())]" priority="1.0"/>
<xsl:template match="div[not(normalize-space())]" priority="1.0"/>