use strict;
use warnings;

use FindBin;

# Put the lib/ directory that sits one level above this script's directory
# at the front of @INC during compilation, so it is searched before every
# other location when the modules below are loaded.
BEGIN {
  my $local_lib = "$FindBin::Bin/../lib";
  unshift(@INC, $local_lib);
}

use Test::More;
use Test::XML::Loy;

# First test of the file: the module under test has to load.
use_ok('KorAP::XML::TEI::Inline');


# Parser instance shared by the top-level checks that follow.
my $inline = KorAP::XML::TEI::Inline->new;

# One inline element: the tag itself does not show up in the primary data.
ok($inline->parse('aaa', \'Der <b>alte</b> Mann'), 'Parsed');

is($inline->data->data, 'Der alte Mann');

# Structure layer: #s0 is the enclosing 'text' span ending at offset 13,
# #s1 is the <b> element covering offsets 4-8 ('alte').
my $bold_structures = Test::XML::Loy->new(
  $inline->structures->to_string('aaa', 2)
);
$bold_structures
  ->attr_is('#s0', 'l', '1')
  ->attr_is('#s0', 'to', 13)
  ->text_is('#s0 fs f[name=name]', 'text')
  ->attr_is('#s1', 'l', '2')
  ->attr_is('#s1', 'from', 4)
  ->attr_is('#s1', 'to', 8)
  ->text_is('#s1 fs f[name=name]', 'b')
;

# Token layer: no feature structure is emitted for this input
# (presumably because it contains no <w> elements - confirm).
my $bold_tokens = Test::XML::Loy->new(
  $inline->tokens->to_string('aaa', 0)
);
$bold_tokens
  ->element_exists_not('fs')
;


# Three plain <w> elements separated by single blanks.
ok($inline->parse('aaa', \'<w>Die</w> <w>alte</w> <w>Frau</w>'), 'Parsed');

is($inline->data->data, 'Die alte Frau');

# Structure layer: the enclosing 'text' span (#s0) followed by one span per
# <w>, each on level 2 with the offsets of its word.
my $word_structures = Test::XML::Loy->new(
  $inline->structures->to_string('aaa', 2)
);
$word_structures
  ->attr_is('#s0', 'l', '1')
  ->attr_is('#s0', 'to', 13)
  ->text_is('#s0 fs f[name=name]', 'text')

  ->attr_is('#s1', 'l', '2')
  ->attr_is('#s1', 'to', 3)
  ->text_is('#s1 fs f[name=name]', 'w')

  ->attr_is('#s2', 'l', '2')
  ->attr_is('#s2', 'from', 4)
  ->attr_is('#s2', 'to', 8)
  ->text_is('#s2 fs f[name=name]', 'w')

  ->attr_is('#s3', 'l', '2')
  ->attr_is('#s3', 'from', 9)
  ->attr_is('#s3', 'to', 13)
  ->text_is('#s3 fs f[name=name]', 'w')
;

# Token layer: the same three words, numbered from #s0 because there is no
# enclosing 'text' span in this layer.
my $word_tokens = Test::XML::Loy->new(
  $inline->tokens->to_string('aaa', 0)
);
$word_tokens
  ->attr_is('#s0', 'l', '2')
  ->attr_is('#s0', 'to', 3)

  ->attr_is('#s1', 'l', '2')
  ->attr_is('#s1', 'from', 4)
  ->attr_is('#s1', 'to', 8)

  ->attr_is('#s2', 'l', '2')
  ->attr_is('#s2', 'from', 9)
  ->attr_is('#s2', 'to', 13)
;

# <w> elements carrying attributes. The line break inside the second start
# tag is part of the fixture and must stay in the string literal.
ok($inline->parse('aaa', \'<w lemma="die" type="det">Die</w> <w
lemma="alt" type="ADJ">alte</w> <w lemma="frau" type="NN">Frau</w>'), 'Parsed');

is($inline->data->data, 'Die alte Frau');

# Token layer: every <w> attribute is expected as a feature named after the
# attribute, next to the usual offsets.
my $annotated_tokens = Test::XML::Loy->new(
  $inline->tokens->to_string('aaa', 1)
);
$annotated_tokens
  ->attr_is('#s0', 'l', '2')
  ->attr_is('#s0', 'to', 3)
  ->text_is('#s0 fs f[name="lemma"]', 'die')
  ->text_is('#s0 fs f[name="type"]', 'det')

  ->attr_is('#s1', 'l', '2')
  ->attr_is('#s1', 'from', 4)
  ->attr_is('#s1', 'to', 8)
  ->text_is('#s1 fs f[name="lemma"]', 'alt')
  ->text_is('#s1 fs f[name="type"]', 'ADJ')

  ->attr_is('#s2', 'l', '2')
  ->attr_is('#s2', 'from', 9)
  ->attr_is('#s2', 'to', 13)
  ->text_is('#s2 fs f[name="lemma"]', 'frau')
  ->text_is('#s2 fs f[name="type"]', 'NN')
;


# Checks the token and dependency layers for <w> elements that carry
# head/deprel attributes, first for a single sentence and then for two
# sentences inside one paragraph.
subtest 'Support dependency parsing' => sub {
  # Subtest-local parser. Declared with 'my' so the file-scoped $inline is
  # not replaced by this differently configured instance.
  # NOTE(review): the fourth constructor argument presumably switches on
  # dependency handling - confirm against KorAP::XML::TEI::Inline.
  my $inline = KorAP::XML::TEI::Inline->new(0,{},0,1);
  ok($inline->parse('Fake News Media',
                    \'<s><w n="1" lemma="Fake" pos="N" head="2" deprel="name" msd="SUBCAT_Prop|CASECHANGE_Up|OTHER_UNK">Fake</w> <w n="2" lemma="News" pos="N" head="3" deprel="name" msd="SUBCAT_Prop|CASECHANGE_Up|OTHER_UNK">News</w> <w n="3" lemma="media" pos="N" head="0" deprel="ROOT" msd="NUM_Sg|CASE_Nom|CASECHANGE_Up">Media</w></s> '
                  ), 'Parsed');

  # The blank after </s> is kept in the primary data.
  is($inline->data->data, 'Fake News Media ', 'Primary data of single sentence');

  # Token layer as serialized with second argument 1: the 'n' attribute is
  # present as a feature.
  Test::XML::Loy->new($inline->tokens->to_string('aaa', 1))
    ->attr_is('#s0', 'l', "3")
    ->attr_is('#s0', 'to', 4)
    ->text_is('#s0 fs f[name="lemma"]', 'Fake')
    ->text_is('#s0 fs f[name="pos"]', 'N')
    ->text_is('#s0 fs f[name="n"]','1')

    ->attr_is('#s1', 'l', "3")
    ->attr_is('#s1', 'from', 5)
    ->attr_is('#s1', 'to', 9)
    ->text_is('#s1 fs f[name="lemma"]', 'News')
    ->text_is('#s1 fs f[name="pos"]', 'N')
    ->text_is('#s1 fs f[name="n"]','2')

    ->attr_is('#s2', 'l', "3")
    ->attr_is('#s2', 'from', 10)
    ->attr_is('#s2', 'to', 15)
    ->text_is('#s2 fs f[name="lemma"]', 'media')
    ->text_is('#s2 fs f[name="pos"]', 'N')
    ->text_is('#s2 fs f[name="n"]','3')
  ;

  # Token layer as serialized with second argument 4: same offsets and
  # lemma/pos features, but no 'n' feature on the first token.
  Test::XML::Loy->new($inline->tokens->to_string('aaa', 4))
    ->attr_is('#s0', 'l', "3")
    ->attr_is('#s0', 'to', 4)
    ->text_is('#s0 fs f[name="lemma"]', 'Fake')
    ->text_is('#s0 fs f[name="pos"]', 'N')
    ->element_exists_not('#s0 fs f[name="n"]')

    ->attr_is('#s1', 'l', "3")
    ->attr_is('#s1', 'from', 5)
    ->attr_is('#s1', 'to', 9)
    ->text_is('#s1 fs f[name="lemma"]', 'News')
    ->text_is('#s1 fs f[name="pos"]', 'N')

    ->attr_is('#s2', 'l', "3")
    ->attr_is('#s2', 'from', 10)
    ->attr_is('#s2', 'to', 15)
    ->text_is('#s2 fs f[name="lemma"]', 'media')
    ->text_is('#s2 fs f[name="pos"]', 'N')
  ;

  # Dependency layer: one span per token with id s<sentence>_n<n>. The
  # nested <rel> carries the deprel label and a <span> with the offsets of
  # the head token; for ROOT (head="0") the expected target span is 0-15.
  Test::XML::Loy->new($inline->dependencies->to_string('aaa', 3))
    ->attr_is('#s1_n1', 'l', "3")
    ->element_exists('#s1_n1[from="0"]')
    ->attr_is('#s1_n1', 'to', 4)
    ->attr_is('#s1_n1 rel', 'label', 'name')
    ->attr_is('#s1_n1 rel span', 'from', 5)
    ->attr_is('#s1_n1 rel span', 'to', 9)
    ->element_exists_not('#s1_n1 fs')

    ->attr_is('#s1_n2', 'l', "3")
    ->attr_is('#s1_n2', 'from', 5)
    ->attr_is('#s1_n2', 'to', 9)
    ->attr_is('#s1_n2 rel', 'label', 'name')
    ->attr_is('#s1_n2 rel span', 'from', 10)
    ->attr_is('#s1_n2 rel span', 'to', 15)

    ->attr_is('#s1_n3', 'l', "3")
    ->attr_is('#s1_n3', 'from', 10)
    ->attr_is('#s1_n3', 'to', 15)
    ->attr_is('#s1_n3 rel', 'label', 'ROOT')
    ->element_exists('#s1_n3 rel span[from="0"]')
    ->attr_is('#s1_n3 rel span', 'to', 15)
  ;

  # Second scenario: a paragraph with two sentences. A new parser instance
  # replaces the subtest-local one.
  $inline = KorAP::XML::TEI::Inline->new(0,{},0,1);
  ok($inline->parse('Fake News Media',
                    \('<p xml:lang="x-|fin:2|"><s xml:lang="fin">'.
                      '<w deprel="nn" head="2" lemma="lJgkPOGUBSFSRQlx" msd="NUM_Sg|CASE_Nom|CASECHANGE_Up" n="1" pos="N">lJgkPOGUBSFSRQlx</w> '.
                      '<w deprel="nsubj" head="3" lemma="rYuqciR" msd="SUBCAT_Prop|NUM_Sg|CASE_Nom|CASECHANGE_Up|OTHER_UNK" n="2" pos="N">rYuqciR</w> '.
                      '<w deprel="ROOT" head="0" lemma="RcidTBqv" msd="PRS_Sg3|VOICE_Act|TENSE_Prt|MOOD_Ind" n="3" pos="V">RcidTBqv</w> '.
                      '<w deprel="poss" head="5" lemma="cHIf" msd="SUBCAT_Acro|NUM_Sg|CASE_Nom|CASECHANGE_Up" n="4" pos="N">cHIf</w> '.
                      '<w deprel="nommod" head="3" lemma="reuvyWZtUhN" msd="NUM_Sg|CASE_Ela" n="5" pos="N">reuvyWZtUhN</w> '.
                      '<w deprel="nsubj" head="7" lemma="KsaXYaFo" msd="NUM_Sg|CASE_Gen" n="6" pos="N">KsaXYaFo</w> '.
                      '<w deprel="iccomp" head="3" lemma="qJhgSDNOYpWg" msd="NUM_Sg|CASE_Ill|VOICE_Act|INF_Inf3" n="7" pos="V">qJhgSDNOYpWg</w> '.
                      '<w deprel="name" head="9" lemma="xtRyGN" msd="SUBCAT_Prop|CASECHANGE_Up|OTHER_UNK" n="8" pos="N">xtRyGN</w> '.
                      '<w deprel="poss" head="10" lemma="XCVuQwU" msd="SUBCAT_Prop|NUM_Sg|CASE_Gen|CASECHANGE_Up|OTHER_UNK" n="9" pos="N">XCVuQwU</w> '.
                      '<w deprel="poss" head="11" lemma="hYwEsYDUbYHmJ" msd="NUM_Sg|CASE_Gen|CASECHANGE_Up|OTHER_UNK" n="10" pos="N">hYwEsYDUbYHmJ</w> '.
                      '<w deprel="dobj" head="7" lemma="yYXOYOqX" msd="NUM_Sg|CASE_Gen" n="11" pos="N">yYXOYOqX</w> '.
                      '<w deprel="nommod" head="7" lemma="LkrLYiYgRSC" msd="NUM_Sg|CASE_Ade" n="12" pos="N">LkrLYiYgRSC</w> '.
                      '<w deprel="num" head="12" lemma="erRenLjillGtDCaRLIx" msd="_" n="13" pos="Num">erRenLjillGtDCaRLIx</w> '.
                      '<w deprel="punct" head="3" lemma="c" msd="_" n="14" pos="Punct">c</w> '.
                      '</s>'."\n".
                      '<s xml:lang="fin">'.
                      '<w deprel="nommod" head="3" lemma="LSymCdojKTj" msd="SUBCAT_Prop|NUM_Sg|CASE_Ine|CASECHANGE_Up|OTHER_UNK" n="1" pos="N">LSymCdojKTj</w> '.
                      '<w deprel="auxpass" head="3" lemma="vQ" msd="PRS_Sg3|VOICE_Act|TENSE_Prs|MOOD_Ind" n="2" pos="V">vQ</w> '.
                      '<w deprel="ROOT" head="0" lemma="nHfBTtne" msd="NUM_Sg|CASE_Nom|VOICE_Pass|PCP_PrfPrc|CMP_Pos" n="3" pos="V">nHfBTtne</w> '.
                      '<w deprel="preconj" head="6" lemma="fmcz" msd="SUBCAT_CC" n="4" pos="C">fmcz</w> '.
                      '<w deprel="poss" head="6" lemma="lHlPTQv" msd="SUBCAT_Prop|NUM_Sg|CASE_Gen|CASECHANGE_Up|OTHER_UNK" n="5" pos="N">lHlPTQv</w> '.
                      '<w deprel="dobj" head="3" lemma="IXxgORnMc" msd="NUM_Pl|CASE_Par|OTHER_UNK" n="6" pos="N">IXxgORnMc</w> '.
                      '<w deprel="cc" head="6" lemma="QdjQ" msd="SUBCAT_CC" n="7" pos="C">QdjQ</w> '.
                      '<w deprel="conj" head="6" lemma="luYMmwBGSUbXCMxqFzeZv" msd="NUM_Pl|CASE_Par|OTHER_UNK" n="8" pos="N">luYMmwBGSUbXCMxqFzeZv</w> '.
                      '<w deprel="punct" head="3" lemma="E" msd="_" n="9" pos="Punct">E</w>'.
                      '</s>'.
                      '</p>')
                  ), 'Parsed');

  is($inline->data->data, 'lJgkPOGUBSFSRQlx rYuqciR RcidTBqv cHIf reuvyWZtUhN KsaXYaFo qJhgSDNOYpWg xtRyGN XCVuQwU hYwEsYDUbYHmJ yYXOYOqX LkrLYiYgRSC erRenLjillGtDCaRLIx c  LSymCdojKTj vQ nHfBTtne fmcz lHlPTQv IXxgORnMc QdjQ luYMmwBGSUbXCMxqFzeZv E', 'Primary data of two sentences');

  # Dependency layer across both sentences. Ids restart per sentence
  # (s1_n*, s2_n*); the ROOT target span is expected to cover the sentence
  # the token belongs to (0-144 and 146-221).
  Test::XML::Loy->new($inline->dependencies->to_string('aaa', 3))
    ->attr_is('#s1_n3', 'l', "4")
    ->attr_is('#s1_n3', 'from', 25)
    ->attr_is('#s1_n3', 'to', 33)
    ->attr_is('#s1_n3 rel', 'label', 'ROOT')
    ->element_exists('#s1_n3 rel span[from=0]')
    ->attr_is('#s1_n3 rel span', 'to', 144)
    ->element_exists_not('#s1_n3 fs')

    ->attr_is('#s1_n14', 'l', "4")
    ->attr_is('#s1_n14', 'from', 143)
    ->attr_is('#s1_n14', 'to', 144)
    ->attr_is('#s1_n14 rel', 'label', 'punct')
    ->attr_is('#s1_n14 rel span', 'from', 25)
    ->attr_is('#s1_n14 rel span', 'to', 33)

    ->attr_is('#s2_n1', 'l', "4")
    ->attr_is('#s2_n1', 'from', 146)
    ->attr_is('#s2_n1', 'to', 157)
    ->attr_is('#s2_n1 rel', 'label', 'nommod')
    ->attr_is('#s2_n1 rel span', 'from', 161)
    ->attr_is('#s2_n1 rel span', 'to', 169)

    ->attr_is('#s2_n9', 'l', "4")
    ->attr_is('#s2_n9', 'from', 220)
    ->attr_is('#s2_n9', 'to', 221)
    ->attr_is('#s2_n9 rel', 'label', 'punct')
    ->attr_is('#s2_n9 rel span', 'from', 161)
    ->attr_is('#s2_n9 rel span', 'to', 169)

    ->attr_is('#s2_n3', 'l', "4")
    ->attr_is('#s2_n3', 'from', 161)
    ->attr_is('#s2_n3', 'to', 169)
    ->attr_is('#s2_n3 rel', 'label', 'ROOT')
    ->attr_is('#s2_n3 rel span', 'from', 146)
    ->attr_is('#s2_n3 rel span', 'to', 221)
  ;

  # Token layer: token ids are numbered continuously over both sentences
  # (#s22 is the last token of the second sentence).
  Test::XML::Loy->new($inline->tokens->to_string('aaa', 1))
    ->attr_is('#s2', 'l', "4")
    ->attr_is('#s2', 'from', 25)
    ->attr_is('#s2', 'to', 33)
    ->text_is('#s2 fs f[name="lemma"]', 'RcidTBqv')
    ->text_is('#s2 fs f[name="pos"]', 'V')
    ->text_is('#s2 fs f[name="msd"]', 'PRS_Sg3|VOICE_Act|TENSE_Prt|MOOD_Ind')

    ->attr_is('#s22', 'l', "4")
    ->attr_is('#s22', 'from', 220)
    ->attr_is('#s22', 'to', 221)
    ->text_is('#s22 fs f[name="lemma"]', 'E')
    ->text_is('#s22 fs f[name="pos"]', 'Punct')
    ->text_is('#s22 fs f[name="msd"]', '_')
  ;

};

# Pins the offsets described in the module documentation for adjacent text
# nodes, whitespace nodes and empty elements. Currently skipped as a whole.
subtest 'Examples from documentation' => sub {
  # Everything below is skipped until the expected offsets are settled.
  plan skip_all => 'Expected behaviour not finalized';

  # Fresh, default-configured parser, so these examples do not depend on
  # whichever instance earlier code left in the file-scoped $inline.
  my $inline = KorAP::XML::TEI::Inline->new;

  # From the documentation (references to "the above/below code fragment"
  # point at the parser implementation, not at this test file):
  #
  # Example:
  # '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'

  # Two text-nodes should normally be separated by a blank.
  # In the above example, that would be the 2 text-nodes
  # 'Campagne in Frankreich' and '1792', which are separated
  # by the whitespace-node ' ' (see [2]).
  #
  # The text-node 'Campagne in Frankreich' leads to the setting
  # of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
  # its from-index gets set to the correct start-index of '1792'
  # (and not to the start-index of the whitespace-node ' ').
  #
  # The assumption here is that in most cases there _is_ a
  # whitespace node between 2 text-nodes. The below code fragment
  # enables a way to check whether this really _was_ the case for
  # the last 2 'non-tag'-nodes when closing a tag:
  #
  # When a whitespace-node is read, its from-index is stored
  # as a hash-key (in %ws), to state that it belongs to a ws-node.
  # So when closing a tag, it can be checked whether the previous
  # 'non-tag'-node (text or whitespace), which is the one before
  # the last read 'non-tag'-node, was actually _not_ a ws-node,
  # but instead a text-node. In that case, the from-value of
  # the last read 'non-tag'-node has to be corrected (see [1]).
  #
  # For whitespace-nodes $add_one is set to 0, so when opening
  # the next tag (in the above example the 2nd 's'-tag), no
  # additional 1 is added (because this was already done by the
  # whitespace-node itself when incrementing the variable $pos).
  #
  # [1]
  # Now, what happens when 2 text-nodes are _not_ separated by a
  # whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
  # In this case, the falsely increased from-value has to be
  # decreased again by 1 when closing the enclosing tag
  # (see above code fragment '... not exists $ws{ $from - 1 } ...').
  #
  # [2]
  # Comparing the 2 examples '<w>fu</w> <w>bar</w>' and
  # '<w>fu</w><w> </w><w>bar</w>': ' ' is in both cases handled as a
  # whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
  #
  # The from-index of the 2nd w-tag in the second example refers to
  # 'bar', which may not have been the intention
  # (even though '<w> </w>' doesn't make a lot of sense).
  # TODO: could this be a bug?
  #
  # Empty tags also cling to the next text-token - e.g. in
  # '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' are the from-
  # and to-indices for the tags 'a' and 'b' both 12,
  # which is the start-index of the token 'tok3'.
  #
  # NOTE(review): the quoted text says 12, while the assertions below
  # expect 10, which is where 'tok3' starts in 'tok1 tok2 tok3'. Confirm
  # which value is intended before enabling this subtest.

  # Example 1: two <head> elements separated by a whitespace node.
  ok($inline->parse(
    'bbb',
    \'<head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s></head>'),'Parsed');
  is($inline->data->data, 'Campagne in Frankreich 1792', 'Primary data of head example');

  # Serialized with the same document id that was passed to parse().
  Test::XML::Loy->new($inline->structures->to_string('bbb', 2))
    ->attr_is('#s0', 'l', "1")
    ->attr_is('#s0', 'to', 27)
    ->text_is('#s0 fs f[name="name"]', 'text')

    ->attr_is('#s1', 'l', "2")
    ->attr_is('#s1', 'to', 22)
    ->text_is('#s1 fs f[name="name"]', 'head')
    ->text_is('#s1 fs f[name="attr"] fs f[name=type]', 'main')

    ->attr_is('#s2', 'l', "3")
    ->attr_is('#s2', 'to', 22)
    ->text_is('#s2 fs f[name="name"]', 's')

    ->attr_is('#s3', 'l', "2")
    ->attr_is('#s3', 'from', 23)
    ->attr_is('#s3', 'to', 27)
    ->text_is('#s3 fs f[name="name"]', 'head')
    ->text_is('#s3 fs f[name="attr"] fs f[name=type]', 'sub')

    ->attr_is('#s4', 'l', "3")
    ->attr_is('#s4', 'from', 23)
    ->attr_is('#s4', 'to', 27)
    ->text_is('#s4 fs f[name="name"]', 's')
  ;

  # Example 2: empty elements <a><b/></a> between two tokens are expected to
  # take the start offset of the following token as both from and to.
  ok($inline->parse(
    'ccc',
    \'<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>'
  ), 'Parsed');
  is($inline->data->data, 'tok1 tok2 tok3', 'Primary data of empty-element example');

  Test::XML::Loy->new($inline->structures->to_string('ccc', 2))
    ->attr_is('#s0', 'l', "1")
    ->attr_is('#s0', 'to', 14)
    ->text_is('#s0 fs f[name="name"]', 'text')

    ->attr_is('#s1', 'l', "2")
    ->attr_is('#s1', 'to', 4)
    ->text_is('#s1 fs f[name="name"]', 'w')

    ->attr_is('#s2', 'l', "2")
    ->attr_is('#s2', 'from', 5)
    ->attr_is('#s2', 'to', 9)
    ->text_is('#s2 fs f[name="name"]', 'w')

    ->attr_is('#s3', 'l', "2")
    ->attr_is('#s3', 'from', 10)
    ->attr_is('#s3', 'to', 10)
    ->text_is('#s3 fs f[name="name"]', 'a')

    ->attr_is('#s4', 'l', "3")
    ->attr_is('#s4', 'from', 10)
    ->attr_is('#s4', 'to', 10)
    ->text_is('#s4 fs f[name="name"]', 'b')

    ->attr_is('#s5', 'l', "2")
    ->attr_is('#s5', 'from', 10)
    ->attr_is('#s5', 'to', 14)
    ->text_is('#s5 fs f[name="name"]', 'w')
  ;

  # Example 3: two text nodes that are NOT separated by whitespace
  # ('Augen' directly followed by ',' inside <c>).
  ok($inline->parse(
    'ddd',
    \'<w>Augen<c>,</c></w> <w>die</w>'
  ), 'Parsed');
  is($inline->data->data, 'Augen, die', 'Primary data of adjacent-text example');

  Test::XML::Loy->new($inline->structures->to_string('ddd', 2))
    ->attr_is('#s0', 'l', "1")
    ->attr_is('#s0', 'to', 10)
    ->text_is('#s0 fs f[name="name"]', 'text')

    ->attr_is('#s1', 'l', "2")
    ->attr_is('#s1', 'to', 6)
    ->text_is('#s1 fs f[name="name"]', 'w')

    ->attr_is('#s2', 'l', "3")
    ->attr_is('#s2', 'from', 5)
    ->attr_is('#s2', 'to', 6)
    ->text_is('#s2 fs f[name="name"]', 'c')

    ->attr_is('#s3', 'l', "2")
    ->attr_is('#s3', 'from', 7)
    ->attr_is('#s3', 'to', 10)
    ->text_is('#s3 fs f[name="name"]', 'w')
  ;
};


# Checks how elements are distributed between the structure layer and the
# token layer for a parser built with extra constructor arguments.
subtest 'Treatment of tokens' => sub {
  # NOTE(review): { b => 1 } presumably names tags to leave out, and the
  # trailing 1 presumably keeps <w> out of the structure layer - confirm
  # against KorAP::XML::TEI::Inline.
  my $inline = KorAP::XML::TEI::Inline->new(0, { b => 1 }, 1);

  ok(
    $inline->parse('aaa', \'<a>Der</a> <b>alte</b> <w pos="NN">Baum</w>'),
    'Parsed'
  );
  is($inline->data->data, 'Der alte Baum');

  # Structure layer: <a> is the only element span (#s1); there is no #s2.
  my $structure_check = Test::XML::Loy->new(
    $inline->structures->to_string('aaa', 1)
  );
  $structure_check
    ->attr_is('#s1', 'to', 3)
    ->element_exists_not('#s2')
  ;

  # Token layer: <w> is the only entry (#s0, offsets 9-13) and carries its
  # 'pos' attribute as a nested feature; there is no #s1.
  my $token_check = Test::XML::Loy->new(
    $inline->tokens->to_string('aaa', 1)
  );
  $token_check
    ->attr_is('#s0', 'from', 9)
    ->attr_is('#s0', 'to', 13)
    ->attr_is('#s0 > fs > f > fs > f', 'name', 'pos')
    ->text_is('#s0 > fs > f > fs > f[name=pos]', 'NN')
    ->element_exists_not('#s1')
  ;
};

done_testing;