Akron | eb12e23 | 2021-02-25 13:49:50 +0100 | [diff] [blame] | 1 | use strict; |
| 2 | use warnings; |
| 3 | |
| 4 | use FindBin; |
| 5 | BEGIN { |
| 6 | unshift @INC, "$FindBin::Bin/../lib"; |
| 7 | }; |
| 8 | |
| 9 | use Test::More; |
| 10 | use Test::XML::Loy; |
| 11 | use_ok('KorAP::XML::TEI::Inline'); |
| 12 | |
| 13 | |
| 14 | my $inline = KorAP::XML::TEI::Inline->new; |
| 15 | |
| 16 | ok($inline->parse('aaa', \'Der <b>alte</b> Mann'), 'Parsed'); |
| 17 | |
| 18 | is($inline->data->data, 'Der alte Mann'); |
| 19 | |
| 20 | Test::XML::Loy->new($inline->structures->to_string('aaa', 2)) |
| 21 | ->attr_is('#s0', 'l', "1") |
| 22 | ->attr_is('#s0', 'to', 13) |
| 23 | ->text_is('#s0 fs f[name=name]', 'text') |
| 24 | ->attr_is('#s1', 'l', "2") |
| 25 | ->attr_is('#s1', 'from', 4) |
| 26 | ->attr_is('#s1', 'to', 8) |
| 27 | ->text_is('#s1 fs f[name=name]', 'b') |
| 28 | ; |
| 29 | |
| 30 | Test::XML::Loy->new($inline->tokens->to_string('aaa', 0)) |
| 31 | ->element_exists_not('fs') |
| 32 | ; |
| 33 | |
| 34 | |
| 35 | ok($inline->parse('aaa', \'<w>Die</w> <w>alte</w> <w>Frau</w>'), 'Parsed'); |
| 36 | |
| 37 | is($inline->data->data, 'Die alte Frau'); |
| 38 | |
| 39 | Test::XML::Loy->new($inline->structures->to_string('aaa', 2)) |
| 40 | ->attr_is('#s0', 'l', "1") |
| 41 | ->attr_is('#s0', 'to', 13) |
| 42 | ->text_is('#s0 fs f[name=name]', 'text') |
| 43 | |
| 44 | ->attr_is('#s1', 'l', "2") |
| 45 | ->attr_is('#s1', 'to', 3) |
| 46 | ->text_is('#s1 fs f[name=name]', 'w') |
| 47 | |
| 48 | ->attr_is('#s2', 'l', "2") |
| 49 | ->attr_is('#s2', 'from', 4) |
| 50 | ->attr_is('#s2', 'to', 8) |
| 51 | ->text_is('#s2 fs f[name=name]', 'w') |
| 52 | |
| 53 | ->attr_is('#s3', 'l', "2") |
| 54 | ->attr_is('#s3', 'from', 9) |
| 55 | ->attr_is('#s3', 'to', 13) |
| 56 | ->text_is('#s3 fs f[name=name]', 'w') |
| 57 | ; |
| 58 | |
| 59 | Test::XML::Loy->new($inline->tokens->to_string('aaa', 0)) |
| 60 | ->attr_is('#s0', 'l', "2") |
| 61 | ->attr_is('#s0', 'to', 3) |
| 62 | |
| 63 | ->attr_is('#s1', 'l', "2") |
| 64 | ->attr_is('#s1', 'from', 4) |
| 65 | ->attr_is('#s1', 'to', 8) |
| 66 | |
| 67 | ->attr_is('#s2', 'l', "2") |
| 68 | ->attr_is('#s2', 'from', 9) |
| 69 | ->attr_is('#s2', 'to', 13) |
| 70 | ; |
| 71 | |
| 72 | ok($inline->parse('aaa', \'<w lemma="die" type="det">Die</w> <w |
| 73 | lemma="alt" type="ADJ">alte</w> <w lemma="frau" type="NN">Frau</w>'), 'Parsed'); |
| 74 | |
| 75 | is($inline->data->data, 'Die alte Frau'); |
| 76 | |
| 77 | Test::XML::Loy->new($inline->tokens->to_string('aaa', 1)) |
| 78 | ->attr_is('#s0', 'l', "2") |
| 79 | ->attr_is('#s0', 'to', 3) |
| 80 | ->text_is('#s0 fs f[name="lemma"]', 'die') |
| 81 | ->text_is('#s0 fs f[name="type"]', 'det') |
| 82 | |
| 83 | ->attr_is('#s1', 'l', "2") |
| 84 | ->attr_is('#s1', 'from', 4) |
| 85 | ->attr_is('#s1', 'to', 8) |
| 86 | ->text_is('#s1 fs f[name="lemma"]', 'alt') |
| 87 | ->text_is('#s1 fs f[name="type"]', 'ADJ') |
| 88 | |
| 89 | ->attr_is('#s2', 'l', "2") |
| 90 | ->attr_is('#s2', 'from', 9) |
| 91 | ->attr_is('#s2', 'to', 13) |
| 92 | ->text_is('#s2 fs f[name="lemma"]', 'frau') |
| 93 | ->text_is('#s2 fs f[name="type"]', 'NN') |
| 94 | ; |
| 95 | |
Akron | 56b8dbd | 2021-02-26 11:23:48 +0100 | [diff] [blame] | 96 | subtest 'Examples from documentation' => sub { |
| 97 | plan skip_all => 'Expected behaviour not finalized'; |
| 98 | |
| 99 | # From the documentation: |
| 100 | # |
| 101 | # Example: |
| 102 | # '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...' |
| 103 | |
| 104 | # Two text-nodes should normally be separated by a blank. |
| 105 | # In the above example, that would be the 2 text-nodes |
| 106 | # 'Campagne in Frankreich' and '1792', which are separated |
| 107 | # by the whitespace-node ' ' (see [2]). |
| 108 | # |
| 109 | # The text-node 'Campagne in Frankreich' leads to the setting |
| 110 | # of '$add_one' to 1, so that when opening the 2nd 'head'-tag, |
| 111 | # it's from-index gets set to the correct start-index of '1792' |
| 112 | # (and not to the start-index of the whitespace-node ' '). |
| 113 | # |
| 114 | # The assumption here is, that in most cases there _is_ a |
| 115 | # whitespace node between 2 text-nodes. The below code fragment |
| 116 | # enables a way, to check, if this really _was_ the case for |
| 117 | # the last 2 'non-tag'-nodes, when closing a tag: |
| 118 | # |
| 119 | # When a whitespace-node is read, its from-index is stored |
| 120 | # as a hash-key (in %ws), to state that it belongs to a ws-node. |
| 121 | # So when closing a tag, it can be checked, if the previous |
| 122 | # 'non-tag'-node (text or whitespace), which is the one before |
| 123 | # the last read 'non-tag'-node, was a actually _not_ a ws-node, |
| 124 | # but instead a text-node. In that case, the from-value of |
| 125 | # the last read 'non-tag'-node has to be corrected (see [1]), |
| 126 | # |
| 127 | # For whitespace-nodes $add_one is set to 0, so when opening |
| 128 | # the next tag (in the above example the 2nd 's'-tag), no |
| 129 | # additional 1 is added (because this was already done by the |
| 130 | # whitespace-node itself when incrementing the variable $pos). |
| 131 | # |
| 132 | # [1] |
| 133 | # Now, what happens, when 2 text-nodes are _not_ seperated by a |
| 134 | # whitespace-node (e.g.: <w>Augen<c>,</c></w>)? |
| 135 | # In this case, the falsely increased from-value has to be |
| 136 | # decreased again by 1 when closing the enclosing tag |
| 137 | # (see above code fragment '... not exists $ws{ $from - 1 } ...'). |
| 138 | # |
| 139 | # [2] |
| 140 | # Comparing the 2 examples '<w>fu</w> <w>bar</w>' and |
| 141 | # '<w>fu</w><w> </w><w>bar</w>', is ' ' in both cases handled as a |
| 142 | # whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE). |
| 143 | # |
| 144 | # The from-index of the 2nd w-tag in the second example refers to |
| 145 | # 'bar', which may not have been the intention |
| 146 | # (even though '<w> </w>' doesn't make a lot of sense). |
| 147 | # TODO: could this be a bug? |
| 148 | # |
| 149 | # Empty tags also cling to the next text-token - e.g. in |
| 150 | # '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' are the from- |
| 151 | # and to-indizes for the tags 'a' and 'b' both 12, |
| 152 | # which is the start-index of the token 'tok3'. |
| 153 | |
| 154 | ok($inline->parse( |
| 155 | 'bbb', |
| 156 | \'<head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s></head>'),'Parsed'); |
| 157 | is($inline->data->data, 'Campagne in Frankreich 1792'); |
| 158 | |
| 159 | Test::XML::Loy->new($inline->structures->to_string('aaa', 2)) |
| 160 | ->attr_is('#s0', 'l', "1") |
| 161 | ->attr_is('#s0', 'to', 27) |
| 162 | ->text_is('#s0 fs f[name="name"]', 'text') |
| 163 | |
| 164 | ->attr_is('#s1', 'l', "2") |
| 165 | ->attr_is('#s1', 'to', 22) |
| 166 | ->text_is('#s1 fs f[name="name"]', 'head') |
| 167 | ->text_is('#s1 fs f[name="attr"] fs f[name=type]', 'main') |
| 168 | |
| 169 | ->attr_is('#s2', 'l', "3") |
| 170 | ->attr_is('#s2', 'to', 22) |
| 171 | ->text_is('#s2 fs f[name="name"]', 's') |
| 172 | |
| 173 | ->attr_is('#s3', 'l', "2") |
| 174 | ->attr_is('#s3', 'from', 23) |
| 175 | ->attr_is('#s3', 'to', 27) |
| 176 | ->text_is('#s3 fs f[name="name"]', 'head') |
| 177 | ->text_is('#s3 fs f[name="attr"] fs f[name=type]', 'sub') |
| 178 | |
| 179 | ->attr_is('#s4', 'l', "3") |
| 180 | ->attr_is('#s4', 'from', 23) |
| 181 | ->attr_is('#s4', 'to', 27) |
| 182 | ->text_is('#s4 fs f[name="name"]', 's') |
| 183 | ; |
| 184 | |
| 185 | ok($inline->parse( |
| 186 | 'ccc', |
| 187 | \'<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' |
| 188 | ), 'Parsed'); |
| 189 | is($inline->data->data, 'tok1 tok2 tok3'); |
| 190 | |
| 191 | Test::XML::Loy->new($inline->structures->to_string('ccc', 2)) |
| 192 | ->attr_is('#s0', 'l', "1") |
| 193 | ->attr_is('#s0', 'to', 14) |
| 194 | ->text_is('#s0 fs f[name="name"]', 'text') |
| 195 | |
| 196 | ->attr_is('#s1', 'l', "2") |
| 197 | ->attr_is('#s1', 'to', 4) |
| 198 | ->text_is('#s1 fs f[name="name"]', 'w') |
| 199 | |
| 200 | ->attr_is('#s2', 'l', "2") |
| 201 | ->attr_is('#s2', 'from', 5) |
| 202 | ->attr_is('#s2', 'to', 9) |
| 203 | ->text_is('#s2 fs f[name="name"]', 'w') |
| 204 | |
| 205 | ->attr_is('#s2', 'l', "2") |
| 206 | ->attr_is('#s2', 'from', 5) |
| 207 | ->attr_is('#s2', 'to', 9) |
| 208 | ->text_is('#s2 fs f[name="name"]', 'w') |
| 209 | |
| 210 | ->attr_is('#s3', 'l', "2") |
| 211 | ->attr_is('#s3', 'from', 10) |
| 212 | ->attr_is('#s3', 'to', 10) |
| 213 | ->text_is('#s3 fs f[name="name"]', 'a') |
| 214 | |
| 215 | ->attr_is('#s4', 'l', "3") |
| 216 | ->attr_is('#s4', 'from', 10) |
| 217 | ->attr_is('#s4', 'to', 10) |
| 218 | ->text_is('#s4 fs f[name="name"]', 'b') |
| 219 | |
| 220 | ->attr_is('#s5', 'l', "2") |
| 221 | ->attr_is('#s5', 'from', 10) |
| 222 | ->attr_is('#s5', 'to', 14) |
| 223 | ->text_is('#s5 fs f[name="name"]', 'w') |
| 224 | ; |
| 225 | |
| 226 | ok($inline->parse( |
| 227 | 'ccc', |
| 228 | \'<w>Augen<c>,</c></w> <w>die</w>' |
| 229 | ), 'Parsed'); |
| 230 | is($inline->data->data, 'Augen, die'); |
| 231 | |
| 232 | Test::XML::Loy->new($inline->structures->to_string('ddd', 2)) |
| 233 | ->attr_is('#s0', 'l', "1") |
| 234 | ->attr_is('#s0', 'to', 10) |
| 235 | ->text_is('#s0 fs f[name="name"]', 'text') |
| 236 | |
| 237 | ->attr_is('#s1', 'l', "2") |
| 238 | ->attr_is('#s1', 'to', 6) |
| 239 | ->text_is('#s1 fs f[name="name"]', 'w') |
| 240 | |
| 241 | ->attr_is('#s2', 'l', "3") |
| 242 | ->attr_is('#s2', 'from', 5) |
| 243 | ->attr_is('#s2', 'to', 6) |
| 244 | ->text_is('#s2 fs f[name="name"]', 'c') |
| 245 | |
| 246 | ->attr_is('#s3', 'l', "2") |
| 247 | ->attr_is('#s3', 'from', 7) |
| 248 | ->attr_is('#s3', 'to', 10) |
| 249 | ->text_is('#s3 fs f[name="name"]', 'w') |
| 250 | ; |
| 251 | }; |
Akron | eb12e23 | 2021-02-25 13:49:50 +0100 | [diff] [blame] | 252 | |
Akron | e2819a1 | 2021-10-12 15:52:55 +0200 | [diff] [blame] | 253 | |
| 254 | subtest 'Treatment of tokens' => sub { |
| 255 | my $inline = KorAP::XML::TEI::Inline->new(0, {b => 1}, 1); |
| 256 | |
| 257 | ok($inline->parse('aaa', \'<a>Der</a> <b>alte</b> <w pos="NN">Baum</w>'), 'Parsed'); |
| 258 | is($inline->data->data, 'Der alte Baum'); |
| 259 | |
| 260 | # Only contains '<a>' |
| 261 | Test::XML::Loy->new($inline->structures->to_string('aaa', 1)) |
| 262 | ->attr_is('#s1', 'to', 3) |
| 263 | ->element_exists_not('#s2') |
| 264 | ; |
| 265 | |
| 266 | # Only contains 'w' |
| 267 | Test::XML::Loy->new($inline->tokens->to_string('aaa', 1)) |
| 268 | ->attr_is('#s0', 'from', 9) |
| 269 | ->attr_is('#s0', 'to', 13) |
| 270 | ->attr_is('#s0 > fs > f > fs > f', 'name', 'pos') |
| 271 | ->text_is('#s0 > fs > f > fs > f[name=pos]', 'NN') |
| 272 | ->element_exists_not('#s1') |
| 273 | ; |
| 274 | }; |
| 275 | |
Akron | eb12e23 | 2021-02-25 13:49:50 +0100 | [diff] [blame] | 276 | done_testing; |