blob: 76ff74d67941120803d6146b4219acd07e92e4a6 [file] [log] [blame]
Akroneb12e232021-02-25 13:49:50 +01001use strict;
2use warnings;
3
4use FindBin;
5BEGIN {
6 unshift @INC, "$FindBin::Bin/../lib";
7};
8
9use Test::More;
10use Test::XML::Loy;
11use_ok('KorAP::XML::TEI::Inline');
12
13
14my $inline = KorAP::XML::TEI::Inline->new;
15
16ok($inline->parse('aaa', \'Der <b>alte</b> Mann'), 'Parsed');
17
18is($inline->data->data, 'Der alte Mann');
19
20Test::XML::Loy->new($inline->structures->to_string('aaa', 2))
21 ->attr_is('#s0', 'l', "1")
22 ->attr_is('#s0', 'to', 13)
23 ->text_is('#s0 fs f[name=name]', 'text')
24 ->attr_is('#s1', 'l', "2")
25 ->attr_is('#s1', 'from', 4)
26 ->attr_is('#s1', 'to', 8)
27 ->text_is('#s1 fs f[name=name]', 'b')
28 ;
29
30Test::XML::Loy->new($inline->tokens->to_string('aaa', 0))
31 ->element_exists_not('fs')
32 ;
33
34
35ok($inline->parse('aaa', \'<w>Die</w> <w>alte</w> <w>Frau</w>'), 'Parsed');
36
37is($inline->data->data, 'Die alte Frau');
38
39Test::XML::Loy->new($inline->structures->to_string('aaa', 2))
40 ->attr_is('#s0', 'l', "1")
41 ->attr_is('#s0', 'to', 13)
42 ->text_is('#s0 fs f[name=name]', 'text')
43
44 ->attr_is('#s1', 'l', "2")
45 ->attr_is('#s1', 'to', 3)
46 ->text_is('#s1 fs f[name=name]', 'w')
47
48 ->attr_is('#s2', 'l', "2")
49 ->attr_is('#s2', 'from', 4)
50 ->attr_is('#s2', 'to', 8)
51 ->text_is('#s2 fs f[name=name]', 'w')
52
53 ->attr_is('#s3', 'l', "2")
54 ->attr_is('#s3', 'from', 9)
55 ->attr_is('#s3', 'to', 13)
56 ->text_is('#s3 fs f[name=name]', 'w')
57 ;
58
59Test::XML::Loy->new($inline->tokens->to_string('aaa', 0))
60 ->attr_is('#s0', 'l', "2")
61 ->attr_is('#s0', 'to', 3)
62
63 ->attr_is('#s1', 'l', "2")
64 ->attr_is('#s1', 'from', 4)
65 ->attr_is('#s1', 'to', 8)
66
67 ->attr_is('#s2', 'l', "2")
68 ->attr_is('#s2', 'from', 9)
69 ->attr_is('#s2', 'to', 13)
70 ;
71
72ok($inline->parse('aaa', \'<w lemma="die" type="det">Die</w> <w
73 lemma="alt" type="ADJ">alte</w> <w lemma="frau" type="NN">Frau</w>'), 'Parsed');
74
75is($inline->data->data, 'Die alte Frau');
76
77Test::XML::Loy->new($inline->tokens->to_string('aaa', 1))
78 ->attr_is('#s0', 'l', "2")
79 ->attr_is('#s0', 'to', 3)
80 ->text_is('#s0 fs f[name="lemma"]', 'die')
81 ->text_is('#s0 fs f[name="type"]', 'det')
82
83 ->attr_is('#s1', 'l', "2")
84 ->attr_is('#s1', 'from', 4)
85 ->attr_is('#s1', 'to', 8)
86 ->text_is('#s1 fs f[name="lemma"]', 'alt')
87 ->text_is('#s1 fs f[name="type"]', 'ADJ')
88
89 ->attr_is('#s2', 'l', "2")
90 ->attr_is('#s2', 'from', 9)
91 ->attr_is('#s2', 'to', 13)
92 ->text_is('#s2 fs f[name="lemma"]', 'frau')
93 ->text_is('#s2 fs f[name="type"]', 'NN')
94 ;
95
Akron56b8dbd2021-02-26 11:23:48 +010096subtest 'Examples from documentation' => sub {
97 plan skip_all => 'Expected behaviour not finalized';
98
99 # From the documentation:
100 #
101 # Example:
102 # '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
103
104 # Two text-nodes should normally be separated by a blank.
105 # In the above example, that would be the 2 text-nodes
106 # 'Campagne in Frankreich' and '1792', which are separated
107 # by the whitespace-node ' ' (see [2]).
108 #
109 # The text-node 'Campagne in Frankreich' leads to the setting
110 # of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
111 # it's from-index gets set to the correct start-index of '1792'
112 # (and not to the start-index of the whitespace-node ' ').
113 #
114 # The assumption here is, that in most cases there _is_ a
115 # whitespace node between 2 text-nodes. The below code fragment
116 # enables a way, to check, if this really _was_ the case for
117 # the last 2 'non-tag'-nodes, when closing a tag:
118 #
119 # When a whitespace-node is read, its from-index is stored
120 # as a hash-key (in %ws), to state that it belongs to a ws-node.
121 # So when closing a tag, it can be checked, if the previous
122 # 'non-tag'-node (text or whitespace), which is the one before
123 # the last read 'non-tag'-node, was a actually _not_ a ws-node,
124 # but instead a text-node. In that case, the from-value of
125 # the last read 'non-tag'-node has to be corrected (see [1]),
126 #
127 # For whitespace-nodes $add_one is set to 0, so when opening
128 # the next tag (in the above example the 2nd 's'-tag), no
129 # additional 1 is added (because this was already done by the
130 # whitespace-node itself when incrementing the variable $pos).
131 #
132 # [1]
133 # Now, what happens, when 2 text-nodes are _not_ seperated by a
134 # whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
135 # In this case, the falsely increased from-value has to be
136 # decreased again by 1 when closing the enclosing tag
137 # (see above code fragment '... not exists $ws{ $from - 1 } ...').
138 #
139 # [2]
140 # Comparing the 2 examples '<w>fu</w> <w>bar</w>' and
141 # '<w>fu</w><w> </w><w>bar</w>', is ' ' in both cases handled as a
142 # whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
143 #
144 # The from-index of the 2nd w-tag in the second example refers to
145 # 'bar', which may not have been the intention
146 # (even though '<w> </w>' doesn't make a lot of sense).
147 # TODO: could this be a bug?
148 #
149 # Empty tags also cling to the next text-token - e.g. in
150 # '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' are the from-
151 # and to-indizes for the tags 'a' and 'b' both 12,
152 # which is the start-index of the token 'tok3'.
153
154 ok($inline->parse(
155 'bbb',
156 \'<head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s></head>'),'Parsed');
157 is($inline->data->data, 'Campagne in Frankreich 1792');
158
159 Test::XML::Loy->new($inline->structures->to_string('aaa', 2))
160 ->attr_is('#s0', 'l', "1")
161 ->attr_is('#s0', 'to', 27)
162 ->text_is('#s0 fs f[name="name"]', 'text')
163
164 ->attr_is('#s1', 'l', "2")
165 ->attr_is('#s1', 'to', 22)
166 ->text_is('#s1 fs f[name="name"]', 'head')
167 ->text_is('#s1 fs f[name="attr"] fs f[name=type]', 'main')
168
169 ->attr_is('#s2', 'l', "3")
170 ->attr_is('#s2', 'to', 22)
171 ->text_is('#s2 fs f[name="name"]', 's')
172
173 ->attr_is('#s3', 'l', "2")
174 ->attr_is('#s3', 'from', 23)
175 ->attr_is('#s3', 'to', 27)
176 ->text_is('#s3 fs f[name="name"]', 'head')
177 ->text_is('#s3 fs f[name="attr"] fs f[name=type]', 'sub')
178
179 ->attr_is('#s4', 'l', "3")
180 ->attr_is('#s4', 'from', 23)
181 ->attr_is('#s4', 'to', 27)
182 ->text_is('#s4 fs f[name="name"]', 's')
183 ;
184
185 ok($inline->parse(
186 'ccc',
187 \'<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>'
188 ), 'Parsed');
189 is($inline->data->data, 'tok1 tok2 tok3');
190
191 Test::XML::Loy->new($inline->structures->to_string('ccc', 2))
192 ->attr_is('#s0', 'l', "1")
193 ->attr_is('#s0', 'to', 14)
194 ->text_is('#s0 fs f[name="name"]', 'text')
195
196 ->attr_is('#s1', 'l', "2")
197 ->attr_is('#s1', 'to', 4)
198 ->text_is('#s1 fs f[name="name"]', 'w')
199
200 ->attr_is('#s2', 'l', "2")
201 ->attr_is('#s2', 'from', 5)
202 ->attr_is('#s2', 'to', 9)
203 ->text_is('#s2 fs f[name="name"]', 'w')
204
205 ->attr_is('#s2', 'l', "2")
206 ->attr_is('#s2', 'from', 5)
207 ->attr_is('#s2', 'to', 9)
208 ->text_is('#s2 fs f[name="name"]', 'w')
209
210 ->attr_is('#s3', 'l', "2")
211 ->attr_is('#s3', 'from', 10)
212 ->attr_is('#s3', 'to', 10)
213 ->text_is('#s3 fs f[name="name"]', 'a')
214
215 ->attr_is('#s4', 'l', "3")
216 ->attr_is('#s4', 'from', 10)
217 ->attr_is('#s4', 'to', 10)
218 ->text_is('#s4 fs f[name="name"]', 'b')
219
220 ->attr_is('#s5', 'l', "2")
221 ->attr_is('#s5', 'from', 10)
222 ->attr_is('#s5', 'to', 14)
223 ->text_is('#s5 fs f[name="name"]', 'w')
224 ;
225
226 ok($inline->parse(
227 'ccc',
228 \'<w>Augen<c>,</c></w> <w>die</w>'
229 ), 'Parsed');
230 is($inline->data->data, 'Augen, die');
231
232 Test::XML::Loy->new($inline->structures->to_string('ddd', 2))
233 ->attr_is('#s0', 'l', "1")
234 ->attr_is('#s0', 'to', 10)
235 ->text_is('#s0 fs f[name="name"]', 'text')
236
237 ->attr_is('#s1', 'l', "2")
238 ->attr_is('#s1', 'to', 6)
239 ->text_is('#s1 fs f[name="name"]', 'w')
240
241 ->attr_is('#s2', 'l', "3")
242 ->attr_is('#s2', 'from', 5)
243 ->attr_is('#s2', 'to', 6)
244 ->text_is('#s2 fs f[name="name"]', 'c')
245
246 ->attr_is('#s3', 'l', "2")
247 ->attr_is('#s3', 'from', 7)
248 ->attr_is('#s3', 'to', 10)
249 ->text_is('#s3 fs f[name="name"]', 'w')
250 ;
251};
Akroneb12e232021-02-25 13:49:50 +0100252
Akrone2819a12021-10-12 15:52:55 +0200253
254subtest 'Treatment of tokens' => sub {
255 my $inline = KorAP::XML::TEI::Inline->new(0, {b => 1}, 1);
256
257 ok($inline->parse('aaa', \'<a>Der</a> <b>alte</b> <w pos="NN">Baum</w>'), 'Parsed');
258 is($inline->data->data, 'Der alte Baum');
259
260 # Only contains '<a>'
261 Test::XML::Loy->new($inline->structures->to_string('aaa', 1))
262 ->attr_is('#s1', 'to', 3)
263 ->element_exists_not('#s2')
264 ;
265
266 # Only contains 'w'
267 Test::XML::Loy->new($inline->tokens->to_string('aaa', 1))
268 ->attr_is('#s0', 'from', 9)
269 ->attr_is('#s0', 'to', 13)
270 ->attr_is('#s0 > fs > f > fs > f', 'name', 'pos')
271 ->text_is('#s0 > fs > f > fs > f[name=pos]', 'NN')
272 ->element_exists_not('#s1')
273 ;
274};
275
Akroneb12e232021-02-25 13:49:50 +0100276done_testing;