Testing whitespace handling following the documentation (Skipped)
Change-Id: I85aea14a6d9fbcef4badea7f3ca695fd0645aa05
diff --git a/lib/KorAP/XML/TEI/Inline.pm b/lib/KorAP/XML/TEI/Inline.pm
index d1eafdc..5446deb 100644
--- a/lib/KorAP/XML/TEI/Inline.pm
+++ b/lib/KorAP/XML/TEI/Inline.pm
@@ -386,41 +386,6 @@
echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
-Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
-
-Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
- 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
-
-The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
- it's from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
-
-The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
- enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
-
-When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
- So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
- the last read 'non-tag'-node, was a actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
- the last read 'non-tag'-node has to be corrected (see [1]),
-
-For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
- additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
-
-[1]
-Now, what happens, when 2 text-nodes are _not_ seperated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
- In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
- (see above code fragment '... not exists $ws{ $from - 1 } ...').
-
-[2]
-Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', is ' ' in both cases handled as a
- whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
-
-The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
- (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
-
-Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' are the from-
- and to-indizes for the tags 'a' and 'b' both 12, which is the start-index of the token 'tok3'.
-
-
## Notes on whitespace fixing
The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks where inserted
diff --git a/t/inline.t b/t/inline.t
index 7fa2357..d5a1db2 100644
--- a/t/inline.t
+++ b/t/inline.t
@@ -93,5 +93,161 @@
->text_is('#s2 fs f[name="type"]', 'NN')
;
+subtest 'Examples from documentation' => sub {
+ plan skip_all => 'Expected behaviour not finalized';
+
+ # From the documentation:
+ #
+ # Example:
+ # '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
+
+ # Two text-nodes should normally be separated by a blank.
+ # In the above example, that would be the 2 text-nodes
+ # 'Campagne in Frankreich' and '1792', which are separated
+ # by the whitespace-node ' ' (see [2]).
+ #
+ # The text-node 'Campagne in Frankreich' leads to the setting
+ # of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
+ # its from-index gets set to the correct start-index of '1792'
+ # (and not to the start-index of the whitespace-node ' ').
+ #
+ # The assumption here is that in most cases there _is_ a
+ # whitespace-node between 2 text-nodes. The below code fragment
+ # provides a way to check whether this really _was_ the case for
+ # the last 2 'non-tag'-nodes when closing a tag:
+ #
+ # When a whitespace-node is read, its from-index is stored
+ # as a hash-key (in %ws), to state that it belongs to a ws-node.
+ # So when closing a tag, it can be checked whether the previous
+ # 'non-tag'-node (text or whitespace), which is the one before
+ # the last read 'non-tag'-node, was actually _not_ a ws-node,
+ # but instead a text-node. In that case, the from-value of
+ # the last read 'non-tag'-node has to be corrected (see [1]).
+ #
+ # For whitespace-nodes $add_one is set to 0, so when opening
+ # the next tag (in the above example the 2nd 's'-tag), no
+ # additional 1 is added (because this was already done by the
+ # whitespace-node itself when incrementing the variable $pos).
+ #
+ # [1]
+ # Now, what happens when 2 text-nodes are _not_ separated by a
+ # whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
+ # In this case, the falsely increased from-value has to be
+ # decreased again by 1 when closing the enclosing tag
+ # (see above code fragment '... not exists $ws{ $from - 1 } ...').
+ #
+ # [2]
+ # Comparing the 2 examples '<w>fu</w> <w>bar</w>' and
+ # '<w>fu</w><w> </w><w>bar</w>', ' ' is in both cases handled as a
+ # whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
+ #
+ # The from-index of the 2nd w-tag in the second example refers to
+ # 'bar', which may not have been the intention
+ # (even though '<w> </w>' doesn't make a lot of sense).
+ # TODO: could this be a bug?
+ #
+ # Empty tags also cling to the next text-token - e.g. in
+ # '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
+ # and to-indices for the tags 'a' and 'b' are both 12,
+ # which is the start-index of the token 'tok3'.
+
+ ok($inline->parse(
+ 'bbb',
+ \'<head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s></head>'),'Parsed');
+ is($inline->data->data, 'Campagne in Frankreich 1792');
+
+ Test::XML::Loy->new($inline->structures->to_string('aaa', 2))
+ ->attr_is('#s0', 'l', "1")
+ ->attr_is('#s0', 'to', 27)
+ ->text_is('#s0 fs f[name="name"]', 'text')
+
+ ->attr_is('#s1', 'l', "2")
+ ->attr_is('#s1', 'to', 22)
+ ->text_is('#s1 fs f[name="name"]', 'head')
+ ->text_is('#s1 fs f[name="attr"] fs f[name=type]', 'main')
+
+ ->attr_is('#s2', 'l', "3")
+ ->attr_is('#s2', 'to', 22)
+ ->text_is('#s2 fs f[name="name"]', 's')
+
+ ->attr_is('#s3', 'l', "2")
+ ->attr_is('#s3', 'from', 23)
+ ->attr_is('#s3', 'to', 27)
+ ->text_is('#s3 fs f[name="name"]', 'head')
+ ->text_is('#s3 fs f[name="attr"] fs f[name=type]', 'sub')
+
+ ->attr_is('#s4', 'l', "3")
+ ->attr_is('#s4', 'from', 23)
+ ->attr_is('#s4', 'to', 27)
+ ->text_is('#s4 fs f[name="name"]', 's')
+ ;
+
+ ok($inline->parse(
+ 'ccc',
+ \'<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>'
+ ), 'Parsed');
+ is($inline->data->data, 'tok1 tok2 tok3');
+
+ Test::XML::Loy->new($inline->structures->to_string('ccc', 2))
+ ->attr_is('#s0', 'l', "1")
+ ->attr_is('#s0', 'to', 14)
+ ->text_is('#s0 fs f[name="name"]', 'text')
+
+ ->attr_is('#s1', 'l', "2")
+ ->attr_is('#s1', 'to', 4)
+ ->text_is('#s1 fs f[name="name"]', 'w')
+
+ ->attr_is('#s2', 'l', "2")
+ ->attr_is('#s2', 'from', 5)
+ ->attr_is('#s2', 'to', 9)
+ ->text_is('#s2 fs f[name="name"]', 'w')
+
+ ->attr_is('#s2', 'l', "2")
+ ->attr_is('#s2', 'from', 5)
+ ->attr_is('#s2', 'to', 9)
+ ->text_is('#s2 fs f[name="name"]', 'w')
+
+ ->attr_is('#s3', 'l', "2")
+ ->attr_is('#s3', 'from', 10)
+ ->attr_is('#s3', 'to', 10)
+ ->text_is('#s3 fs f[name="name"]', 'a')
+
+ ->attr_is('#s4', 'l', "3")
+ ->attr_is('#s4', 'from', 10)
+ ->attr_is('#s4', 'to', 10)
+ ->text_is('#s4 fs f[name="name"]', 'b')
+
+ ->attr_is('#s5', 'l', "2")
+ ->attr_is('#s5', 'from', 10)
+ ->attr_is('#s5', 'to', 14)
+ ->text_is('#s5 fs f[name="name"]', 'w')
+ ;
+
+ ok($inline->parse(
+ 'ccc',
+ \'<w>Augen<c>,</c></w> <w>die</w>'
+ ), 'Parsed');
+ is($inline->data->data, 'Augen, die');
+
+ Test::XML::Loy->new($inline->structures->to_string('ddd', 2))
+ ->attr_is('#s0', 'l', "1")
+ ->attr_is('#s0', 'to', 10)
+ ->text_is('#s0 fs f[name="name"]', 'text')
+
+ ->attr_is('#s1', 'l', "2")
+ ->attr_is('#s1', 'to', 6)
+ ->text_is('#s1 fs f[name="name"]', 'w')
+
+ ->attr_is('#s2', 'l', "3")
+ ->attr_is('#s2', 'from', 5)
+ ->attr_is('#s2', 'to', 6)
+ ->text_is('#s2 fs f[name="name"]', 'c')
+
+ ->attr_is('#s3', 'l', "2")
+ ->attr_is('#s3', 'from', 7)
+ ->attr_is('#s3', 'to', 10)
+ ->text_is('#s3 fs f[name="name"]', 'w')
+ ;
+};
done_testing;