Add topic domain classification in XSLT pass2
Generated with mallet based on the old training data in /vol/work/TE via
calling a Java function from XSLT.
Resolves #6
diff --git a/xslt/pass2.xsl b/xslt/pass2.xsl
index 697c108..087fb78 100644
--- a/xslt/pass2.xsl
+++ b/xslt/pass2.xsl
@@ -1,14 +1,28 @@
<xsl:stylesheet version="3.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:saxon="http://saxon.sf.net/"
- exclude-result-prefixes="saxon">
+ xmlns:xs="http://www.w3.org/2001/XMLSchema"
+ xmlns:TextClassifier="java:de.ids_mannheim.TextClassifier"
+ exclude-result-prefixes="saxon xs TextClassifier">
<xsl:output method="xml" indent="yes" saxon:line-length="1000"
doctype-public="-//IDS//DTD IDS-I5 1.0//EN"
doctype-system="http://corpora.ids-mannheim.de/I5/DTD/i5.dtd"
/>
+ <xsl:variable name="domainClassifier" select="TextClassifier:new('models/dereko_domains_s.classifier')"/>
+
<xsl:mode on-no-match="shallow-copy"/>
+ <xsl:template match="textClass">
+ <xsl:variable name="classification" select="tokenize(TextClassifier:topicDomainsFromText($domainClassifier, ../../../text), ';')"/>
+ <textClass>
+ <catRef n="{$classification[1]}" target="{$classification[2]}" scheme="topic"/>
+ <xsl:if test="xs:decimal($classification[3]) > 0.0000001">
+ <catRef n="{$classification[3]}" target="{$classification[4]}" scheme="topic"/>
+ </xsl:if>
+ </textClass>
+ </xsl:template>
+
<xsl:template match="p[not(normalize-space())]" priority="1.0"/>
<xsl:template match="div[not(normalize-space())]" priority="1.0"/>