WIP: just saving the next step (adds the map_s-p lookup map for paragraphs, sentences and words)
Change-Id: I899cb4427ca26e327fb9f3f34fec653dd0770fd7
diff --git a/nkjp2korap.xsl b/nkjp2korap.xsl
index 03f66f3..acc425d 100644
--- a/nkjp2korap.xsl
+++ b/nkjp2korap.xsl
@@ -2,6 +2,7 @@
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"
xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns:f="func"
+ xmlns:fn="http://www.w3.org/2005/xpath-functions"
xmlns:map="http://www.w3.org/2005/xpath-functions/map" exclude-result-prefixes="xs f map nkjp tei"
version="3.0" expand-text="yes">
@@ -260,7 +261,11 @@
<xsl:param name="target" as="xs:string"/>
-<!-- map the entire document, so that the processing only takes place once, and for fast lookups -->
+<!-- map the entire document, so that the processing only takes place once, and for fast lookups
+
+ TODO: move this variable up to process_single_sample, so that it can be re-used there
+
+ -->
<xsl:variable name="map_w" as="map(xs:untypedAtomic,item()+)">
<xsl:variable name="segs" select="$ann_segmentation.xml/tei:teiCorpus/tei:TEI/tei:text/tei:body/tei:p/tei:s//tei:seg[count(@nkjp:rejected) eq 0]" as="element()+"/>
<xsl:map>
@@ -276,7 +281,78 @@
</xsl:map>
</xsl:variable>
- <xsl:message select="'size: ' || map:size($map_w)"/>
+ <xsl:variable name="map_s-p" as="map(xs:untypedAtomic,item()+)">
+   <!-- Lookup map derived from $map_w, with one entry per paragraph, per sentence and per word:
+          paragraph id: ('p', position, length)
+          sentence id:  ('s', position within its paragraph, length, paragraph id)
+          word id:      ('w', position within its sentence, string length, sentence id, item [4] of the $map_w entry)
+        Of each $map_w entry, item [1] serves as the paragraph grouping key, item [2] as the sentence
+        grouping key and item [3] as the word sort key.
+        NOTE(review): item [5] is presumably the word form and item [4] presumably the boolean
+        "is preceded by whitespace" flag; $map_w is built elsewhere, so confirm against its definition. -->
+   <xsl:map>
+     <xsl:for-each-group select="map:keys($map_w)" group-by="map:get($map_w, .)[1]">
+       <xsl:sort select="xs:integer(substring-before(substring-after(., 'segm_'), '.'))"
+                 order="ascending"/>
+       <xsl:sort select="xs:integer(substring-before(substring-after(., '.'), '-'))"
+                 order="ascending"/>
+       <xsl:variable name="current-p-pos" select="fn:position()" as="xs:integer"/>
+       <!-- the above is used in the sentence loop, when we check if the sentence is text-initial -->
+       <xsl:variable name="current-p" select="fn:current-grouping-key()" as="xs:string"/> <!-- NOTE(review): the grouping key may actually be xs:untypedAtomic -->
+       <xsl:variable name="p-length" select="
+           sum(for $id in current-group()
+           return
+             string-length(map:get($map_w, $id)[5])) + count(current-group()) - xs:integer(position() ne 1) -
+           count(fn:filter(current-group(), function ($w-id) {
+             map:get($map_w, $w-id)[4] eq false()
+           }))"/>
+       <!-- The general algorithm is:
+            * count and sum the lengths of all the words
+            * add 'whitespace' for all of them (= count them and add that), and then
+            * subtract whitespace for those of them that are not actually preceded by it
+            and if the 1st word is_preceded_by_ws then subtract 1, because the whitespace in front of
+            the unit does not belong to the unit.
+            Because identifying that 1st word would require an extra step, we're taking a shortcut via
+            position(): only the very first unit of the text is assumed to start with a word that is not
+            preceded by whitespace. That shortcut strongly depends on the presence of the xsl:sort
+            instructions. -->
+
+       <xsl:message select="'sum: ' || sum( for $id in current-group() return string-length(map:get($map_w, $id)[5]) )"/>
+       <!--<xsl:message select="for $id in current-group() return (string-length(map:get($map_w, $id)[5]),map:get($map_w, $id)[4] )"/>-->
+       <xsl:message select="'cur-group count: ' || count(fn:current-group())"/>
+       <!--<xsl:message select="fn:for-each(current-group(), function($w-id) { map:get($map_w,$w-id)[4] eq false() } )"></xsl:message>-->
+       <xsl:message select="'subtract:' || count(fn:filter(current-group(), function($w-id) { map:get($map_w,$w-id)[4] eq false() } ))"></xsl:message>
+       <xsl:message select="'position: ' || position() || ', xs:integer(position() ne 1)=' || xs:integer(position() ne 1)"></xsl:message>
+       <xsl:message select="'p-length: ' || $p-length"/>
+
+       <xsl:map-entry key="current-grouping-key()" select="'p', position(), $p-length"/>
+
+       <xsl:message select="'p: ', $current-p || ' pos:' || position(), current-group()"/>
+
+       <xsl:for-each-group select="current-group()" group-by="map:get($map_w, .)[2]">
+         <xsl:sort select="xs:integer(substring-before(substring-after(., 'segm_'), '.'))"
+                   order="ascending"/>
+         <xsl:sort select="xs:integer(substring-before(substring-after(., '.'), '-'))"
+                   order="ascending"/>
+         <xsl:variable name="current-s" select="fn:current-grouping-key()" as="xs:string"/> <!-- NOTE(review): the grouping key may actually be xs:untypedAtomic -->
+         <!-- Same algorithm as for $p-length. A sentence is text-initial only when it is the first
+              sentence (position() eq 1) of the first paragraph ($current-p-pos eq 1); every other
+              sentence gets the 1-character correction for the whitespace in front of its first word.
+              Testing the paragraph position alone would leave the later sentences of the first
+              paragraph one character too long. -->
+         <xsl:variable name="s-length" select="
+             sum(for $id in current-group()
+             return
+               string-length(map:get($map_w, $id)[5])) + count(current-group()) - xs:integer($current-p-pos ne 1 or position() ne 1) -
+             count(fn:filter(current-group(), function ($w-id) {
+               map:get($map_w, $w-id)[4] eq false()
+             }))"/>
+
+
+         <xsl:map-entry key="current-grouping-key()" select="'s', position(), $s-length, $current-p"/>
+
+         <xsl:message select="'s: ', position(), current-group()"/>
+
+         <!-- words of the current sentence, ordered by item [3] of their $map_w entry -->
+         <xsl:for-each select="current-group()">
+           <xsl:sort select="map:get($map_w, .)[3]" order="ascending"/>
+           <xsl:map-entry key="." select="'w', position(), string-length(map:get($map_w, .)[5]), $current-s, map:get($map_w, .)[4]"/>
+<!-- <xsl:message select="map:get($map_w, .)[5]"/>-->
+         </xsl:for-each>
+
+       </xsl:for-each-group>
+
+
+
+     </xsl:for-each-group>
+   </xsl:map>
+ </xsl:variable>
+
+ <xsl:message select="'map_w size: ' || map:size($map_w)"/> <!-- diagnostic: number of entries in $map_w -->
+ <xsl:message select="'map_s-p size: ' || map:size($map_s-p)"/> <!-- diagnostic: number of entries in $map_s-p, which holds the 'p', 's' and 'w' entries together -->
<xsl:result-document encoding="UTF-8" method="xml" indent="yes"
xpath-default-namespace="http://ids-mannheim.de/ns/KorAP" href="{$target}">