| use strict; |
| use warnings; |
| |
| use FindBin; |
| BEGIN { |
| unshift @INC, "$FindBin::Bin/../lib"; |
| }; |
| |
| use Test::More; |
| use Test::XML::Loy; |
| use_ok('KorAP::XML::TEI::Inline'); |
| |
| |
| my $inline = KorAP::XML::TEI::Inline->new; |
| |
| ok($inline->parse('aaa', \'Der <b>alte</b> Mann'), 'Parsed'); |
| |
| is($inline->data->data, 'Der alte Mann'); |
| |
| Test::XML::Loy->new($inline->structures->to_string('aaa', 2)) |
| ->attr_is('#s0', 'l', "1") |
| ->attr_is('#s0', 'to', 13) |
| ->text_is('#s0 fs f[name=name]', 'text') |
| ->attr_is('#s1', 'l', "2") |
| ->attr_is('#s1', 'from', 4) |
| ->attr_is('#s1', 'to', 8) |
| ->text_is('#s1 fs f[name=name]', 'b') |
| ; |
| |
| Test::XML::Loy->new($inline->tokens->to_string('aaa', 0)) |
| ->element_exists_not('fs') |
| ; |
| |
| |
| ok($inline->parse('aaa', \'<w>Die</w> <w>alte</w> <w>Frau</w>'), 'Parsed'); |
| |
| is($inline->data->data, 'Die alte Frau'); |
| |
| Test::XML::Loy->new($inline->structures->to_string('aaa', 2)) |
| ->attr_is('#s0', 'l', "1") |
| ->attr_is('#s0', 'to', 13) |
| ->text_is('#s0 fs f[name=name]', 'text') |
| |
| ->attr_is('#s1', 'l', "2") |
| ->attr_is('#s1', 'to', 3) |
| ->text_is('#s1 fs f[name=name]', 'w') |
| |
| ->attr_is('#s2', 'l', "2") |
| ->attr_is('#s2', 'from', 4) |
| ->attr_is('#s2', 'to', 8) |
| ->text_is('#s2 fs f[name=name]', 'w') |
| |
| ->attr_is('#s3', 'l', "2") |
| ->attr_is('#s3', 'from', 9) |
| ->attr_is('#s3', 'to', 13) |
| ->text_is('#s3 fs f[name=name]', 'w') |
| ; |
| |
| Test::XML::Loy->new($inline->tokens->to_string('aaa', 0)) |
| ->attr_is('#s0', 'l', "2") |
| ->attr_is('#s0', 'to', 3) |
| |
| ->attr_is('#s1', 'l', "2") |
| ->attr_is('#s1', 'from', 4) |
| ->attr_is('#s1', 'to', 8) |
| |
| ->attr_is('#s2', 'l', "2") |
| ->attr_is('#s2', 'from', 9) |
| ->attr_is('#s2', 'to', 13) |
| ; |
| |
| ok($inline->parse('aaa', \'<w lemma="die" type="det">Die</w> <w |
| lemma="alt" type="ADJ">alte</w> <w lemma="frau" type="NN">Frau</w>'), 'Parsed'); |
| |
| is($inline->data->data, 'Die alte Frau'); |
| |
| Test::XML::Loy->new($inline->tokens->to_string('aaa', 1)) |
| ->attr_is('#s0', 'l', "2") |
| ->attr_is('#s0', 'to', 3) |
| ->text_is('#s0 fs f[name="lemma"]', 'die') |
| ->text_is('#s0 fs f[name="type"]', 'det') |
| |
| ->attr_is('#s1', 'l', "2") |
| ->attr_is('#s1', 'from', 4) |
| ->attr_is('#s1', 'to', 8) |
| ->text_is('#s1 fs f[name="lemma"]', 'alt') |
| ->text_is('#s1 fs f[name="type"]', 'ADJ') |
| |
| ->attr_is('#s2', 'l', "2") |
| ->attr_is('#s2', 'from', 9) |
| ->attr_is('#s2', 'to', 13) |
| ->text_is('#s2 fs f[name="lemma"]', 'frau') |
| ->text_is('#s2 fs f[name="type"]', 'NN') |
| ; |
| |
| subtest 'Examples from documentation' => sub { |
| plan skip_all => 'Expected behaviour not finalized'; |
| |
| # From the documentation: |
| # |
| # Example: |
| # '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...' |
| |
| # Two text-nodes should normally be separated by a blank. |
| # In the above example, that would be the 2 text-nodes |
| # 'Campagne in Frankreich' and '1792', which are separated |
| # by the whitespace-node ' ' (see [2]). |
| # |
| # The text-node 'Campagne in Frankreich' leads to the setting |
| # of '$add_one' to 1, so that when opening the 2nd 'head'-tag, |
| # it's from-index gets set to the correct start-index of '1792' |
| # (and not to the start-index of the whitespace-node ' '). |
| # |
| # The assumption here is, that in most cases there _is_ a |
| # whitespace node between 2 text-nodes. The below code fragment |
| # enables a way, to check, if this really _was_ the case for |
| # the last 2 'non-tag'-nodes, when closing a tag: |
| # |
| # When a whitespace-node is read, its from-index is stored |
| # as a hash-key (in %ws), to state that it belongs to a ws-node. |
| # So when closing a tag, it can be checked, if the previous |
| # 'non-tag'-node (text or whitespace), which is the one before |
| # the last read 'non-tag'-node, was a actually _not_ a ws-node, |
| # but instead a text-node. In that case, the from-value of |
| # the last read 'non-tag'-node has to be corrected (see [1]), |
| # |
| # For whitespace-nodes $add_one is set to 0, so when opening |
| # the next tag (in the above example the 2nd 's'-tag), no |
| # additional 1 is added (because this was already done by the |
| # whitespace-node itself when incrementing the variable $pos). |
| # |
| # [1] |
| # Now, what happens, when 2 text-nodes are _not_ seperated by a |
| # whitespace-node (e.g.: <w>Augen<c>,</c></w>)? |
| # In this case, the falsely increased from-value has to be |
| # decreased again by 1 when closing the enclosing tag |
| # (see above code fragment '... not exists $ws{ $from - 1 } ...'). |
| # |
| # [2] |
| # Comparing the 2 examples '<w>fu</w> <w>bar</w>' and |
| # '<w>fu</w><w> </w><w>bar</w>', is ' ' in both cases handled as a |
| # whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE). |
| # |
| # The from-index of the 2nd w-tag in the second example refers to |
| # 'bar', which may not have been the intention |
| # (even though '<w> </w>' doesn't make a lot of sense). |
| # TODO: could this be a bug? |
| # |
| # Empty tags also cling to the next text-token - e.g. in |
| # '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' are the from- |
| # and to-indizes for the tags 'a' and 'b' both 12, |
| # which is the start-index of the token 'tok3'. |
| |
| ok($inline->parse( |
| 'bbb', |
| \'<head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s></head>'),'Parsed'); |
| is($inline->data->data, 'Campagne in Frankreich 1792'); |
| |
| Test::XML::Loy->new($inline->structures->to_string('aaa', 2)) |
| ->attr_is('#s0', 'l', "1") |
| ->attr_is('#s0', 'to', 27) |
| ->text_is('#s0 fs f[name="name"]', 'text') |
| |
| ->attr_is('#s1', 'l', "2") |
| ->attr_is('#s1', 'to', 22) |
| ->text_is('#s1 fs f[name="name"]', 'head') |
| ->text_is('#s1 fs f[name="attr"] fs f[name=type]', 'main') |
| |
| ->attr_is('#s2', 'l', "3") |
| ->attr_is('#s2', 'to', 22) |
| ->text_is('#s2 fs f[name="name"]', 's') |
| |
| ->attr_is('#s3', 'l', "2") |
| ->attr_is('#s3', 'from', 23) |
| ->attr_is('#s3', 'to', 27) |
| ->text_is('#s3 fs f[name="name"]', 'head') |
| ->text_is('#s3 fs f[name="attr"] fs f[name=type]', 'sub') |
| |
| ->attr_is('#s4', 'l', "3") |
| ->attr_is('#s4', 'from', 23) |
| ->attr_is('#s4', 'to', 27) |
| ->text_is('#s4 fs f[name="name"]', 's') |
| ; |
| |
| ok($inline->parse( |
| 'ccc', |
| \'<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' |
| ), 'Parsed'); |
| is($inline->data->data, 'tok1 tok2 tok3'); |
| |
| Test::XML::Loy->new($inline->structures->to_string('ccc', 2)) |
| ->attr_is('#s0', 'l', "1") |
| ->attr_is('#s0', 'to', 14) |
| ->text_is('#s0 fs f[name="name"]', 'text') |
| |
| ->attr_is('#s1', 'l', "2") |
| ->attr_is('#s1', 'to', 4) |
| ->text_is('#s1 fs f[name="name"]', 'w') |
| |
| ->attr_is('#s2', 'l', "2") |
| ->attr_is('#s2', 'from', 5) |
| ->attr_is('#s2', 'to', 9) |
| ->text_is('#s2 fs f[name="name"]', 'w') |
| |
| ->attr_is('#s2', 'l', "2") |
| ->attr_is('#s2', 'from', 5) |
| ->attr_is('#s2', 'to', 9) |
| ->text_is('#s2 fs f[name="name"]', 'w') |
| |
| ->attr_is('#s3', 'l', "2") |
| ->attr_is('#s3', 'from', 10) |
| ->attr_is('#s3', 'to', 10) |
| ->text_is('#s3 fs f[name="name"]', 'a') |
| |
| ->attr_is('#s4', 'l', "3") |
| ->attr_is('#s4', 'from', 10) |
| ->attr_is('#s4', 'to', 10) |
| ->text_is('#s4 fs f[name="name"]', 'b') |
| |
| ->attr_is('#s5', 'l', "2") |
| ->attr_is('#s5', 'from', 10) |
| ->attr_is('#s5', 'to', 14) |
| ->text_is('#s5 fs f[name="name"]', 'w') |
| ; |
| |
| ok($inline->parse( |
| 'ccc', |
| \'<w>Augen<c>,</c></w> <w>die</w>' |
| ), 'Parsed'); |
| is($inline->data->data, 'Augen, die'); |
| |
| Test::XML::Loy->new($inline->structures->to_string('ddd', 2)) |
| ->attr_is('#s0', 'l', "1") |
| ->attr_is('#s0', 'to', 10) |
| ->text_is('#s0 fs f[name="name"]', 'text') |
| |
| ->attr_is('#s1', 'l', "2") |
| ->attr_is('#s1', 'to', 6) |
| ->text_is('#s1 fs f[name="name"]', 'w') |
| |
| ->attr_is('#s2', 'l', "3") |
| ->attr_is('#s2', 'from', 5) |
| ->attr_is('#s2', 'to', 6) |
| ->text_is('#s2 fs f[name="name"]', 'c') |
| |
| ->attr_is('#s3', 'l', "2") |
| ->attr_is('#s3', 'from', 7) |
| ->attr_is('#s3', 'to', 10) |
| ->text_is('#s3 fs f[name="name"]', 'w') |
| ; |
| }; |
| |
| |
| subtest 'Treatment of tokens' => sub { |
| my $inline = KorAP::XML::TEI::Inline->new(0, {b => 1}, 1); |
| |
| ok($inline->parse('aaa', \'<a>Der</a> <b>alte</b> <w pos="NN">Baum</w>'), 'Parsed'); |
| is($inline->data->data, 'Der alte Baum'); |
| |
| # Only contains '<a>' |
| Test::XML::Loy->new($inline->structures->to_string('aaa', 1)) |
| ->attr_is('#s1', 'to', 3) |
| ->element_exists_not('#s2') |
| ; |
| |
| # Only contains 'w' |
| Test::XML::Loy->new($inline->tokens->to_string('aaa', 1)) |
| ->attr_is('#s0', 'from', 9) |
| ->attr_is('#s0', 'to', 13) |
| ->attr_is('#s0 > fs > f > fs > f', 'name', 'pos') |
| ->text_is('#s0 > fs > f > fs > f[name=pos]', 'NN') |
| ->element_exists_not('#s1') |
| ; |
| }; |
| |
| done_testing; |