Blame - t/inline.t - KorAP/KorAP-XML-TEI

blob: 76ff74d67941120803d6146b4219acd07e92e4a6 [file] [log] [blame]

Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	1	use strict;
				2	use warnings;
				3
				4	use FindBin;
				5	BEGIN {
				6	unshift @INC, "$FindBin::Bin/../lib";
				7	};
				8
				9	use Test::More;
				10	use Test::XML::Loy;
				11	use_ok('KorAP::XML::TEI::Inline');
				12
				13
				14	my $inline = KorAP::XML::TEI::Inline->new;
				15
				16	ok($inline->parse('aaa', \'Der <b>alte</b> Mann'), 'Parsed');
				17
				18	is($inline->data->data, 'Der alte Mann');
				19
				20	Test::XML::Loy->new($inline->structures->to_string('aaa', 2))
				21	->attr_is('#s0', 'l', "1")
				22	->attr_is('#s0', 'to', 13)
				23	->text_is('#s0 fs f[name=name]', 'text')
				24	->attr_is('#s1', 'l', "2")
				25	->attr_is('#s1', 'from', 4)
				26	->attr_is('#s1', 'to', 8)
				27	->text_is('#s1 fs f[name=name]', 'b')
				28	;
				29
				30	Test::XML::Loy->new($inline->tokens->to_string('aaa', 0))
				31	->element_exists_not('fs')
				32	;
				33
				34
				35	ok($inline->parse('aaa', \'<w>Die</w> <w>alte</w> <w>Frau</w>'), 'Parsed');
				36
				37	is($inline->data->data, 'Die alte Frau');
				38
				39	Test::XML::Loy->new($inline->structures->to_string('aaa', 2))
				40	->attr_is('#s0', 'l', "1")
				41	->attr_is('#s0', 'to', 13)
				42	->text_is('#s0 fs f[name=name]', 'text')
				43
				44	->attr_is('#s1', 'l', "2")
				45	->attr_is('#s1', 'to', 3)
				46	->text_is('#s1 fs f[name=name]', 'w')
				47
				48	->attr_is('#s2', 'l', "2")
				49	->attr_is('#s2', 'from', 4)
				50	->attr_is('#s2', 'to', 8)
				51	->text_is('#s2 fs f[name=name]', 'w')
				52
				53	->attr_is('#s3', 'l', "2")
				54	->attr_is('#s3', 'from', 9)
				55	->attr_is('#s3', 'to', 13)
				56	->text_is('#s3 fs f[name=name]', 'w')
				57	;
				58
				59	Test::XML::Loy->new($inline->tokens->to_string('aaa', 0))
				60	->attr_is('#s0', 'l', "2")
				61	->attr_is('#s0', 'to', 3)
				62
				63	->attr_is('#s1', 'l', "2")
				64	->attr_is('#s1', 'from', 4)
				65	->attr_is('#s1', 'to', 8)
				66
				67	->attr_is('#s2', 'l', "2")
				68	->attr_is('#s2', 'from', 9)
				69	->attr_is('#s2', 'to', 13)
				70	;
				71
				72	ok($inline->parse('aaa', \'<w lemma="die" type="det">Die</w> <w
				73	lemma="alt" type="ADJ">alte</w> <w lemma="frau" type="NN">Frau</w>'), 'Parsed');
				74
				75	is($inline->data->data, 'Die alte Frau');
				76
				77	Test::XML::Loy->new($inline->tokens->to_string('aaa', 1))
				78	->attr_is('#s0', 'l', "2")
				79	->attr_is('#s0', 'to', 3)
				80	->text_is('#s0 fs f[name="lemma"]', 'die')
				81	->text_is('#s0 fs f[name="type"]', 'det')
				82
				83	->attr_is('#s1', 'l', "2")
				84	->attr_is('#s1', 'from', 4)
				85	->attr_is('#s1', 'to', 8)
				86	->text_is('#s1 fs f[name="lemma"]', 'alt')
				87	->text_is('#s1 fs f[name="type"]', 'ADJ')
				88
				89	->attr_is('#s2', 'l', "2")
				90	->attr_is('#s2', 'from', 9)
				91	->attr_is('#s2', 'to', 13)
				92	->text_is('#s2 fs f[name="lemma"]', 'frau')
				93	->text_is('#s2 fs f[name="type"]', 'NN')
				94	;
				95
Akron	56b8dbd	2021-02-26 11:23:48 +0100	[diff] [blame]	96	subtest 'Examples from documentation' => sub {
				97	plan skip_all => 'Expected behaviour not finalized';
				98
				99	# From the documentation:
				100	#
				101	# Example:
				102	# '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
				103
				104	# Two text-nodes should normally be separated by a blank.
				105	# In the above example, that would be the 2 text-nodes
				106	# 'Campagne in Frankreich' and '1792', which are separated
				107	# by the whitespace-node ' ' (see [2]).
				108	#
				109	# The text-node 'Campagne in Frankreich' leads to the setting
				110	# of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
				111	# it's from-index gets set to the correct start-index of '1792'
				112	# (and not to the start-index of the whitespace-node ' ').
				113	#
				114	# The assumption here is, that in most cases there _is_ a
				115	# whitespace node between 2 text-nodes. The below code fragment
				116	# enables a way, to check, if this really _was_ the case for
				117	# the last 2 'non-tag'-nodes, when closing a tag:
				118	#
				119	# When a whitespace-node is read, its from-index is stored
				120	# as a hash-key (in %ws), to state that it belongs to a ws-node.
				121	# So when closing a tag, it can be checked, if the previous
				122	# 'non-tag'-node (text or whitespace), which is the one before
				123	# the last read 'non-tag'-node, was a actually _not_ a ws-node,
				124	# but instead a text-node. In that case, the from-value of
				125	# the last read 'non-tag'-node has to be corrected (see [1]),
				126	#
				127	# For whitespace-nodes $add_one is set to 0, so when opening
				128	# the next tag (in the above example the 2nd 's'-tag), no
				129	# additional 1 is added (because this was already done by the
				130	# whitespace-node itself when incrementing the variable $pos).
				131	#
				132	# [1]
				133	# Now, what happens, when 2 text-nodes are _not_ seperated by a
				134	# whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
				135	# In this case, the falsely increased from-value has to be
				136	# decreased again by 1 when closing the enclosing tag
				137	# (see above code fragment '... not exists $ws{ $from - 1 } ...').
				138	#
				139	# [2]
				140	# Comparing the 2 examples '<w>fu</w> <w>bar</w>' and
				141	# '<w>fu</w><w> </w><w>bar</w>', is ' ' in both cases handled as a
				142	# whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
				143	#
				144	# The from-index of the 2nd w-tag in the second example refers to
				145	# 'bar', which may not have been the intention
				146	# (even though '<w> </w>' doesn't make a lot of sense).
				147	# TODO: could this be a bug?
				148	#
				149	# Empty tags also cling to the next text-token - e.g. in
				150	# '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' are the from-
				151	# and to-indizes for the tags 'a' and 'b' both 12,
				152	# which is the start-index of the token 'tok3'.
				153
				154	ok($inline->parse(
				155	'bbb',
				156	\'<head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s></head>'),'Parsed');
				157	is($inline->data->data, 'Campagne in Frankreich 1792');
				158
				159	Test::XML::Loy->new($inline->structures->to_string('aaa', 2))
				160	->attr_is('#s0', 'l', "1")
				161	->attr_is('#s0', 'to', 27)
				162	->text_is('#s0 fs f[name="name"]', 'text')
				163
				164	->attr_is('#s1', 'l', "2")
				165	->attr_is('#s1', 'to', 22)
				166	->text_is('#s1 fs f[name="name"]', 'head')
				167	->text_is('#s1 fs f[name="attr"] fs f[name=type]', 'main')
				168
				169	->attr_is('#s2', 'l', "3")
				170	->attr_is('#s2', 'to', 22)
				171	->text_is('#s2 fs f[name="name"]', 's')
				172
				173	->attr_is('#s3', 'l', "2")
				174	->attr_is('#s3', 'from', 23)
				175	->attr_is('#s3', 'to', 27)
				176	->text_is('#s3 fs f[name="name"]', 'head')
				177	->text_is('#s3 fs f[name="attr"] fs f[name=type]', 'sub')
				178
				179	->attr_is('#s4', 'l', "3")
				180	->attr_is('#s4', 'from', 23)
				181	->attr_is('#s4', 'to', 27)
				182	->text_is('#s4 fs f[name="name"]', 's')
				183	;
				184
				185	ok($inline->parse(
				186	'ccc',
				187	\'<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>'
				188	), 'Parsed');
				189	is($inline->data->data, 'tok1 tok2 tok3');
				190
				191	Test::XML::Loy->new($inline->structures->to_string('ccc', 2))
				192	->attr_is('#s0', 'l', "1")
				193	->attr_is('#s0', 'to', 14)
				194	->text_is('#s0 fs f[name="name"]', 'text')
				195
				196	->attr_is('#s1', 'l', "2")
				197	->attr_is('#s1', 'to', 4)
				198	->text_is('#s1 fs f[name="name"]', 'w')
				199
				200	->attr_is('#s2', 'l', "2")
				201	->attr_is('#s2', 'from', 5)
				202	->attr_is('#s2', 'to', 9)
				203	->text_is('#s2 fs f[name="name"]', 'w')
				204
				205	->attr_is('#s2', 'l', "2")
				206	->attr_is('#s2', 'from', 5)
				207	->attr_is('#s2', 'to', 9)
				208	->text_is('#s2 fs f[name="name"]', 'w')
				209
				210	->attr_is('#s3', 'l', "2")
				211	->attr_is('#s3', 'from', 10)
				212	->attr_is('#s3', 'to', 10)
				213	->text_is('#s3 fs f[name="name"]', 'a')
				214
				215	->attr_is('#s4', 'l', "3")
				216	->attr_is('#s4', 'from', 10)
				217	->attr_is('#s4', 'to', 10)
				218	->text_is('#s4 fs f[name="name"]', 'b')
				219
				220	->attr_is('#s5', 'l', "2")
				221	->attr_is('#s5', 'from', 10)
				222	->attr_is('#s5', 'to', 14)
				223	->text_is('#s5 fs f[name="name"]', 'w')
				224	;
				225
				226	ok($inline->parse(
				227	'ccc',
				228	\'<w>Augen<c>,</c></w> <w>die</w>'
				229	), 'Parsed');
				230	is($inline->data->data, 'Augen, die');
				231
				232	Test::XML::Loy->new($inline->structures->to_string('ddd', 2))
				233	->attr_is('#s0', 'l', "1")
				234	->attr_is('#s0', 'to', 10)
				235	->text_is('#s0 fs f[name="name"]', 'text')
				236
				237	->attr_is('#s1', 'l', "2")
				238	->attr_is('#s1', 'to', 6)
				239	->text_is('#s1 fs f[name="name"]', 'w')
				240
				241	->attr_is('#s2', 'l', "3")
				242	->attr_is('#s2', 'from', 5)
				243	->attr_is('#s2', 'to', 6)
				244	->text_is('#s2 fs f[name="name"]', 'c')
				245
				246	->attr_is('#s3', 'l', "2")
				247	->attr_is('#s3', 'from', 7)
				248	->attr_is('#s3', 'to', 10)
				249	->text_is('#s3 fs f[name="name"]', 'w')
				250	;
				251	};
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	252
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	253
				254	subtest 'Treatment of tokens' => sub {
				255	my $inline = KorAP::XML::TEI::Inline->new(0, {b => 1}, 1);
				256
				257	ok($inline->parse('aaa', \'<a>Der</a> <b>alte</b> <w pos="NN">Baum</w>'), 'Parsed');
				258	is($inline->data->data, 'Der alte Baum');
				259
				260	# Only contains '<a>'
				261	Test::XML::Loy->new($inline->structures->to_string('aaa', 1))
				262	->attr_is('#s1', 'to', 3)
				263	->element_exists_not('#s2')
				264	;
				265
				266	# Only contains 'w'
				267	Test::XML::Loy->new($inline->tokens->to_string('aaa', 1))
				268	->attr_is('#s0', 'from', 9)
				269	->attr_is('#s0', 'to', 13)
				270	->attr_is('#s0 > fs > f > fs > f', 'name', 'pos')
				271	->text_is('#s0 > fs > f > fs > f[name=pos]', 'NN')
				272	->element_exists_not('#s1')
				273	;
				274	};
				275
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	276	done_testing;