Establish header object for corpus, doc and text header parsing
Change-Id: I26767fc27054bd3f1b70a622557c0b2f04cac816
diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
new file mode 100644
index 0000000..3d9c06d
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Header.pm
@@ -0,0 +1,194 @@
+package KorAP::XML::TEI::Header;
+use strict;
+use warnings;
+use Encode qw(encode_utf8);
+
+# Parsing of i5 header files (corpus, document and text level)
+#
+# A header object is a blessed array holding the header text collected
+# so far, the header type and the sigle (see the slot constants below).
+
+# Warning:
+# Opening and closing tags (without attributes) have to be in one line
+
+# TODO: IDS-specific
+my $_HEADER_TAG = 'idsHeader';
+
+# Slots of the blessed array
+use constant {
+  TEXT     => 0,  # header text collected so far
+  HEADTYPE => 1,  # value of the type attribute of the opening tag
+  SIGLE    => 2   # sigle found by parse() (UTF-8 encoded), '' before
+};
+
+# convert '"', '&', '<' and '>' into their corresponding sgml-entities
+our %ent = (
+  '"' => '&quot;',
+  '&' => '&amp;',
+  '<' => '&lt;',
+  '>' => '&gt;'
+);
+
+# convert header type to sigle type
+our %sig = (
+  corpus   => 'korpusSigle',
+  document => 'dokumentSigle',
+  text     => 'textSigle'
+);
+
+
+# Return a copy of the passed string with all characters listed in %ent
+# replaced by their entities
+sub _escape {
+  return $_[0] =~ s/(["&<>])/$ent{$1}/gr;
+}
+
+
+# Create new header object
+#
+# Expects the opening header tag (e.g. '<idsHeader type="text">') as its
+# only argument and dies, if no type attribute can be found in it.
+sub new {
+  my ($class, $text) = @_;
+  $text //= '';
+
+  # Check header types to distinguish between siglen types
+  my ($type) = $text =~ m!^<${_HEADER_TAG} [^<]*type="([^"]+)"!
+    or die "ERROR ($0): Unable to parse header init '$text'";
+
+  return bless [$text, $type, ''], $class;
+}
+
+
+# Parse header object from filehandle
+#
+# Reads lines from the filehandle up to (and including) the line closing
+# the header, appends them to the header text and remembers the sigle.
+# Returns the object on success and nothing, if the header contains no
+# sigle or the input ends before the header is closed.
+# Dies, if the sigle line or the closing line is not in expected format.
+sub parse {
+  my ($self, $fh) = @_;
+
+  my $sig_type = $sig{$self->[HEADTYPE]} // 'textSigle';
+
+  # Iterate over file handle - a lexical is used instead of $_, so the
+  # line variable of the calling loop is not overwritten
+  while (defined(my $line = <$fh>)) {
+
+    # Change:
+    #   This version keeps comments in header files
+
+    # End of header found - finish parsing
+    if ($line =~ m!^(.*</${_HEADER_TAG}>)(.*)$!) {
+      my ($head, $rest) = ($1, $2);
+
+      # Add to text
+      $self->[TEXT] .= $head;
+
+      die "ERROR ($0): main(): input line number $.: line with closing header tag '${_HEADER_TAG}'"
+        ." contains additional information ... => Aborting\n\tline=$line"
+        if $rest !~ /^\s*$/;
+
+      if ($self->dir eq '') {
+        print STDERR "WARNING ($0): main(): input line number $.: empty " . $sig_type .
+          " in header => nothing to do ...\n header=" . $self->[TEXT] . "\n";
+        return;
+      }
+
+      return $self;
+    }
+
+    # Check for sigle in line
+    if ($line =~ m!^(.*)<$sig_type(?: [^>]*)?>([^<]*)(.*)$!) {
+      my ($pfx, $sig, $sfx) = ($1, $2, $3);
+
+      die "ERROR ($0): main(): input line number $.: line with sigle-tag is not in expected format ... => Aborting\n\tline=$line"
+        if $pfx !~ /^\s*$/ || $sfx !~ m!^</$sig_type>\s*$! || $sig =~ /^\s*$/;
+
+      $self->[SIGLE] = encode_utf8($sig);
+
+      # Escape the sigle as read and not its encoded copy - the line is
+      # encoded as a whole in to_zip(), so inserting encoded bytes here
+      # would encode a non-ASCII sigle twice
+      my $sig_esc = _escape($sig);
+
+      # replace sigle in header, if there's an escaped version that differs
+      $line =~ s!(<$sig_type(?: [^>]*)?>)[^<]+</$sig_type>!$1$sig_esc</$sig_type>!
+        if $sig_esc ne $sig;
+    }
+
+    # Add line to header text
+    $self->[TEXT] .= $line;
+  }
+
+  # Input ended before the header was closed
+  return;
+}
+
+
+# Type of the header
+sub type {
+  return $_[0]->[HEADTYPE];
+}
+
+
+# Directory (leveled) of the header file
+sub dir {
+  return $_[0]->[SIGLE] =~ tr{.}{/}r;
+}
+
+
+# corpus/doc/text sigle
+sub sigle {
+  return $_[0]->[SIGLE];
+}
+
+
+# corpus/doc/text id
+sub id {
+  return $_[0]->[SIGLE] =~ tr{/}{_}r;
+}
+
+
+# corpus/doc/text sigle escaped
+sub sigle_esc {
+  return _escape($_[0]->[SIGLE]);
+}
+
+
+# corpus/doc/text id escaped
+sub id_esc {
+  return _escape($_[0]->id);
+}
+
+
+# Return data as a string
+sub to_string {
+  my $self = shift;
+  return $self->_header . $self->[TEXT];
+}
+
+
+# Header for XML output
+sub _header {
+  # TODO: IDS-specific
+  return <<"HEADER";
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng"
+ type="application/xml"
+ schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN"
+ "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+HEADER
+}
+
+
+# Write data to zip stream
+sub to_zip {
+  my ($self, $zip) = @_;
+  $zip->print(encode_utf8($self->to_string));
+}
+
+
+1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 2b2c6da..5d2d9ff 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -23,6 +23,7 @@
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Zipper;
+use KorAP::XML::TEI::Header;
our $VERSION = '0.01';
@@ -55,15 +56,6 @@
#
# ~~~ parameter (mandatory) ~~~
#
-
- # optional
-my $_CORP_SIGLE = "korpusSigle"; # opening and closing tags (without attributes) have to be in one line
- # (e.g.: <korpusSigle>GOE</korpusSigle>)
- # optional
-my $_DOC_SIGLE = "dokumentSigle"; # analog
- # mandatory
-my $_TEXT_SIGLE = "textSigle"; # analog
- # mandatory
my $_TEXT_BODY = "text"; # tag (without attributes), which contains the primary text
# optional
my $_CORP_HEADER_BEG = "idsHeader type=\"corpus\""; # just keep the correct order of the attributes and evtl. add an '.*' between them
@@ -140,8 +132,6 @@
my $data; # contains the primary text (created by func. 'retr_info' from $buf_in), which is written to '$data_file'
my $dir; # text directory (below $_root_dir)
-my $dir_crp; # corpus directory (below $_root_dir)
-my $dir_doc; # document directory (below $_root_dir)
my ( $text_id, $text_id_esc ); # '$text_id_esc' = escaped version of $text_id (see %ent)
@@ -150,15 +140,10 @@
# note: the index still refers to the 'single character'-versions, which are counted as 1
# (search for '&' in data.xml and see corresponding indices in $_tokens_file)
-my $header_txt; # raw text header (written to '$_root_dir$dir/$_header_file')
-my $header_doc; # raw document header (written to '$_root_dir$dir_doc/$_header_file')
-my $header_crp; # raw corpus header (written to '$_root_dir$dir_crp/$_header_file')
+my ( $data_fl );
-my ( $header_fl_crp, $header_fl_doc, # flags for tracking where we are in the input document
- $header_fl_txt, $data_fl );
+my ( $data_prfx1, $data_prfx2, $data_sfx ); # $data_* are written to $_data_file
-my ( $header_prfx, $data_prfx1, # $header_prfx is written to $_header_file, $data_* are written to $_data_file
- $data_prfx2, $data_sfx );
my @structures; # list of arrays, where each array represents a TEI I5 tag (except $_TOKENS_TAG) from the input document
# - the input of this array is written in func. 'write_structures' into the file '$_structure_file'
@@ -193,8 +178,6 @@
my ( $i, $c ); # index variables used in loops
-my ( $_CORP_HEADER_END, $_DOC_HEADER_END, $_TEXT_HEADER_END );
-
#
# ~~~ main ~~~
@@ -204,9 +187,7 @@
($_XCT_LN)?($_IDX=5):($_IDX=4);
-$header_prfx = $data_prfx1 = $data_prfx2 = $data_sfx = "";
-
-$header_fl_txt = $header_fl_doc = $header_fl_crp = 0;
+$data_prfx1 = $data_prfx2 = $data_sfx = "";
$inside_tokens_tag = -1;
@@ -215,15 +196,9 @@
$_root_dir .= '/'; # base dir must always end with a slash
$_root_dir =~ s/^\.?\///; # remove leading / (only relative paths allowed in IO::Compress::Zip) and redundant ./
-$_CORP_HEADER_BEG =~ s#^([^\s]+)(.*)$#$1\[\^>\]*$2#; $_CORP_HEADER_END = $1;
-$_DOC_HEADER_BEG =~ s#^([^\s]+)(.*)$#$1\[\^>\]*$2#; $_DOC_HEADER_END = $1;
-$_TEXT_HEADER_BEG =~ s#^([^\s]+)(.*)$#$1\[\^>\]*$2#; $_TEXT_HEADER_END = $1;
-
-## TODO: remove this, because it's IDS-specific
-$header_prfx = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
-$header_prfx .= "<?xml-model href=\"header.rng\" type=\"application/xml\" schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n";
-$header_prfx .= "<!DOCTYPE idsCorpus PUBLIC \"-//IDS//DTD IDS-XCES 1.0//EN\" \"http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd\">\n";
-##
+$_CORP_HEADER_BEG =~ s#^([^\s]+)(.*)$#$1\[\^>\]*$2#;
+$_DOC_HEADER_BEG =~ s#^([^\s]+)(.*)$#$1\[\^>\]*$2#;
+$_TEXT_HEADER_BEG =~ s#^([^\s]+)(.*)$#$1\[\^>\]*$2#;
$data_prfx1 = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
$data_prfx1 .= "<?xml-model href=\"text.rng\" type=\"application/xml\" schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n";
@@ -261,8 +236,7 @@
$data_fl = 0;
- $buf_in = $data = $dir = $dir_doc = $dir_crp = "";
- $header_txt = $header_doc = $header_crp = "";
+ $buf_in = $data = $dir = "";
if ( $input_fname ne '' ){
@@ -412,7 +386,7 @@
} else { # $dir eq ""
print STDERR "WARNING ($0): main(): maybe empty textSigle => skipping this text ...\n";
- print STDERR "WARNING ($0): main(): text header=$header_txt\n";
+ # print STDERR "WARNING ($0): main(): text header=$header_txt\n";
print STDERR "WARNING ($0): main(): data=$data\n";
}
@@ -469,152 +443,10 @@
# add line to buffer
$buf_in .= $_;
- } elsif ( $header_fl_txt && m#^(.*</${_TEXT_HEADER_END}>)(.*)$# ){
-
-
- # ~ end of text header ~
-
-
- #print STDERR "end of text header\n";
-
- # write it to header.xml
-
- $sfx = $2;
-
- $header_txt .= $1; $header_fl_txt = 0;
-
-
- die "ERROR ($0): main(): input line number $lc: line with closing text-header tag '${_TEXT_HEADER_END}'"
- ." contains additional information ... => Aborting\n\tline=$_"
- if $sfx !~ /^\s*$/;
-
- if ( $dir eq "" ){
-
- print STDERR "WARNING ($0): main(): input line number $lc: empty textSigle in text header => nothing to do ...\ntext header=$header_txt\n";
-
- } else {
-
- print STDERR "DEBUG ($0): Writing file $_root_dir$dir/$_header_file\n" if $_DEBUG;
-
- $header_txt = encode_utf8( $header_txt );
-
- $zipper->new_stream("$_root_dir$dir/$_header_file")
- ->print("$header_prfx$header_txt");
-
- $header_txt = "";
- }
-
- } elsif ( $header_fl_txt ){
-
- # ~ inside text header ~
-
-
- #print STDERR "inside text header\n";
-
- if( m#^(.*)<${_TEXT_SIGLE}(?: [^>]*)?>([^<]*)(.*)$# ){
-
- $pfx = $1; $sfx = $3;
-
- $dir = $2; $text_id = $dir;
-
- $text_id =~ tr/\//_/; $dir =~ s/("|&|<|>)/$ent{$1}/g;
-
- $text_id = encode_utf8( $text_id );
-
- die "ERROR ($0): main(): input line number $lc: line with text-sigle tag '$_TEXT_SIGLE' is not in expected format ... => Aborting\n\tline=$_"
- if $pfx !~ /^\s*$/ || $sfx !~ m#^</${_TEXT_SIGLE}>\s*$# || $dir =~ /^\s*$/;
-
- # log output for seeing progression
- print STDERR "$0: main(): text_id=".decode_utf8( $text_id )."\n";
-
- $text_id_esc = $text_id;
-
- s#(<${_TEXT_SIGLE}(?: [^>]*)?>)[^<]+(</${_TEXT_SIGLE}>)#$1$dir$2# # to be consistent with escaping, escape also textSigle in text-header
- if $text_id_esc =~ s/("|&|<|>)/$ent{$1}/g;
-
- $dir =~ tr/\./\//;
- }
-
- $header_txt .= $_;
-
- } elsif ( $header_fl_doc && m#^(.*</${_DOC_HEADER_END}>)(.*)$# ){
-
-
- # ~ end of document header ~
-
- #print STDERR "end of doc header\n";
-
- # write it to header.xml
-
- $sfx = $2;
-
- $header_doc .= $1; $header_fl_doc = 0;
-
- die "ERROR ($0): main(): input line number $lc: line with closing document-header tag '${_DOC_HEADER_END}'"
- ." contains additional information ... => Aborting\n\tline=$_"
- if $sfx !~ /^\s*$/;
-
- if( $dir_doc eq "" ){
-
- print STDERR "WARNING ($0): main(): input line number $lc: empty document sigle in document header"
- ." => nothing to do ...\ndocument header=$header_doc\n";
-
- } else {
-
- print STDERR "DEBUG ($0): Writing file $_root_dir$dir_doc/$_header_file\n" if $_DEBUG;
-
- $header_doc = encode_utf8( $header_doc );
-
- $zipper->new_stream("$_root_dir$dir_doc/$_header_file")
- ->print("$header_prfx$header_doc");
-
- $header_doc = $dir_doc = "";
- }
-
- } elsif ( $header_fl_doc ){
-
-
- # ~ inside document header ~
-
-
- #print STDERR "inside doc header\n";
-
- if ( m#^(.*)<${_DOC_SIGLE}(?: [^>]*)?>([^<]*)(.*)$# ){
-
- $pfx = $1; $sfx = $3;
-
- $dir_doc = $2;
-
- die "ERROR ($0): main(): input line number $lc: line with document-sigle tag '$_DOC_SIGLE' is not in expected format ... => Aborting\n\tline=$_"
- if $pfx !~ /^\s*$/ || $sfx !~ m#^</${_DOC_SIGLE}>\s*$# || $dir_doc =~ /^\s*$/;
-
- s#(<${_DOC_SIGLE}(?: [^>]*)?>)[^<]+(</${_DOC_SIGLE}>)#$1$dir_doc$2# # to be consistent with escaping, escape also textSigle in Document-Header
- if $dir_doc =~ s/("|&|<|>)/$ent{$1}/g;
- }
-
- $header_doc .= $_;
-
- } elsif ( m#^(.*)(<${_TEXT_HEADER_BEG}.*)$# ){
-
- # ~ start of text header ~
-
-
- #print STDERR "begin of text header\n";
-
- $header_txt = $_; $header_fl_txt = 1; $pfx = $1;
-
- $tl = 0; # reset (needed for ~ whitespace handling ~)
-
- die "ERROR ($0): main(): input line number $lc: line with opening text-header tag '${_TEXT_HEADER_BEG}'"
- ." is not in expected format ... => Aborting\n\tline=$_"
- if $pfx !~ /^\s*$/;
-
} elsif ( m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
-
# ~ start of text body ~
-
#print STDERR "inside text body\n";
$pfx = $1; $sfx = $2;
@@ -625,92 +457,44 @@
." contains additional information ... => Aborting\n\tline=$_"
if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
- } elsif ( m#^(.*)(<${_DOC_HEADER_BEG}.*)$# ){
+ } elsif ( m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$# ){
+ # ~ start of header ~
+ $pfx = $1;
+ my $content = "$2\n";
- # ~ start of document header ~
-
-
- #print STDERR "begin of doc header\n";
-
- $header_doc = "$2\n"; $header_fl_doc = 1; $pfx = $1;
-
- die "ERROR ($0): main(): input line number $lc: line with opening document-header tag '${_DOC_HEADER_BEG}'"
- ."is not in expected format ... => Aborting\n\tline=$_"
- if $pfx !~ /^\s*$/;
-
- } elsif ( $header_fl_crp && m#^(.*</${_CORP_HEADER_END}>)(.*)$# ){
-
-
- # ~ end of corpus header ~
-
-
- #print STDERR "end of corp header\n";
-
- $sfx = $2;
-
- $header_crp .= $1; $header_fl_crp = 0;
-
- die "ERROR ($0): main(): input line number $lc: line with closing corpus-header tag '${_CORP_HEADER_END}'"
- ." contains additional information ... => Aborting\n\tline=$_"
- if $sfx !~ /^\s*$/;
-
- if ( $dir_crp eq "" ){
-
- print STDERR "WARNING ($0): main(): input line number $lc: empty corpus sigle in corpus header => nothing to do ...\ncorpus header=$header_crp\n";
-
- } else {
-
- print STDERR "DEBUG ($0): Writing file $_root_dir$dir_crp/$_header_file\n" if $_DEBUG;
-
- $header_crp = encode_utf8( $header_crp );
-
- $zipper->new_stream("$_root_dir$dir_crp/$_header_file")
- ->print("$header_prfx$header_crp");
-
- $header_crp = $dir_crp = "";
- }
-
- } elsif ( $header_fl_crp ){
-
-
- # ~ inside corpus header ~
-
-
- #print STDERR "inside corp header\n";
-
- if ( m#^(.*)<${_CORP_SIGLE}(?: [^>]*)?>([^<]*)(.*)$# ){
-
- $pfx = $1; $sfx = $3;
-
- $dir_crp = $2;
-
- die "ERROR ($0): main(): input line number $lc: line with korpusSigle-tag is not in expected format ... => Aborting\n\tline=$_"
- if $pfx !~ /^\s*$/ || $sfx !~ m#^</${_CORP_SIGLE}>\s*$# || $dir_crp =~ /^\s*$/;
-
- if ( $dir_crp =~ s/("|&|<|>)/$ent{$1}/g ){
-
- s#(<${_CORP_SIGLE}(?: [^>]*)?>)[^<]+(</${_CORP_SIGLE}>)#$1$dir_crp$2# # to be consistent with escaping, escape also textSigle in Corpus-Header
- }
- }
-
- $header_crp .= $_;
-
- } elsif ( m#^(.*)(<${_CORP_HEADER_BEG}.*)$# ){
-
-
- # ~ start of corpus header ~
-
-
- #print STDERR "begin of corp header\n";
-
- $header_crp = $2; $header_fl_crp = 1; $pfx = $1;
-
- die "ERROR ($0): main(): input line number $lc: line with opening corpus-header tag '${_CORP_HEADER_BEG}'"
+ die "ERROR ($0): main(): input line number $lc: line with opening header tag"
." is not in expected format ... => Aborting\n\tline=$_"
if $pfx !~ /^\s*$/;
- }
+ # Parse header
+ my $header = KorAP::XML::TEI::Header->new($content)->parse($input_fh);
+
+ # Header was parseable
+ if ($header) {
+
+ # Write header to zip
+ my $file = $_root_dir . $header->dir . '/' . $_header_file;
+
+ print STDERR "DEBUG ($0): Writing file $file\n" if $_DEBUG;
+
+ $header->to_zip($zipper->new_stream($file));
+
+ # Header is for text level
+ if ($header->type eq 'text') {
+
+ # Remember dir and sigles
+ $dir = $header->dir;
+ $text_id = $header->id;
+ $text_id_esc = $header->id_esc;
+
+ # log output for seeing progression
+ print STDERR "$0: main(): text_id=".decode_utf8( $text_id )."\n";
+
+ $tl = 0; # reset (needed for ~ whitespace handling ~)
+ };
+ }
+ }
} #end: while
$zipper->close;
diff --git a/t/header.t b/t/header.t
new file mode 100644
index 0000000..0012e75
--- /dev/null
+++ b/t/header.t
@@ -0,0 +1,143 @@
+use strict;
+use warnings;
+
+use FindBin;
+BEGIN {
+ unshift @INC, "$FindBin::Bin/../lib";
+};
+
+use Test::More;
+use Test::KorAP::XML::TEI qw!korap_tempfile!;
+
+require_ok('KorAP::XML::TEI::Header');
+
+my $h;
+# new() dies on an opening tag without type attribute, so $h stays undefined
+eval { $h = KorAP::XML::TEI::Header->new('<idsHeader>') };
+
+ok(!$h, 'Header invalid');
+
+subtest 'Corpus Header' => sub {
+ $h = KorAP::XML::TEI::Header->new('<idsHeader type="corpus">');
+ ok($h, 'Header valid');
+ # Before parsing only the opening tag and its type are known
+ is($h->sigle, '', 'Check sigle');
+ is($h->sigle_esc, '', 'Check sigle escaped');
+ is($h->dir, '', 'Check dir');
+ is($h->type, 'corpus', 'Check type');
+ like($h->to_string, qr!^<\?xml version!, 'String');
+ like($h->to_string, qr!<idsHeader type=\"corpus\">$!, 'String');
+
+ my ($fh, $filename) = korap_tempfile('header_1');
+
+ print $fh <<'HTML';
+<-- mehrzeiliger
+Kommentar
+ --> <fileDesc>
+ <titleStmt>
+ <korpusSigle>GOE</korpusSigle>
+ <c.title>Goethe-Korpus</c.title>
+ </titleStmt>
+</idsHeader>
+Test
+HTML
+
+ seek($fh, 0, 0);
+
+ ok($h->parse($fh), 'Parsing');
+ # Everything up to the closing tag is kept, the trailing line is not
+ like($h->to_string, qr!^<\?xml version!, 'String');
+ like($h->to_string, qr!<idsHeader type=\"corpus\">!, 'String');
+ like($h->to_string, qr!<-- mehrzeiliger!, 'String');
+ like($h->to_string, qr!titleStmt!, 'String');
+ like($h->to_string, qr!</idsHeader>$!, 'String');
+ # The sigle contains no special characters, so all variants are equal
+ is($h->sigle, 'GOE', 'Check sigle');
+ is($h->sigle_esc, 'GOE', 'Check sigle escaped');
+ is($h->id, 'GOE', 'Check id');
+ is($h->id_esc, 'GOE', 'Check id escaped');
+ is($h->dir, 'GOE', 'Check dir');
+ is($h->type, 'corpus', 'Check type');
+};
+
+subtest 'Document Header' => sub {
+ $h = KorAP::XML::TEI::Header->new('<idsHeader type="document">');
+ ok($h, 'Header valid');
+ # Before parsing no sigle is known
+ is($h->sigle, '', 'Check sigle');
+ is($h->sigle_esc, '', 'Check sigle escaped');
+ is($h->dir, '', 'Check dir');
+ like($h->to_string, qr!^<\?xml version!, 'String');
+ like($h->to_string, qr!<idsHeader type=\"document\">$!, 'String');
+
+ my ($fh, $filename) = korap_tempfile('header_2');
+
+ print $fh <<'HTML';
+ <fileDesc>
+ <titleStmt>
+ <dokumentSigle>GOE/"AAA"</dokumentSigle>
+ </titleStmt>
+</idsHeader>
+Test
+HTML
+
+ seek($fh, 0, 0);
+
+ ok($h->parse($fh), 'Parsing');
+
+ like($h->to_string, qr!^<\?xml version!, 'String');
+ like($h->to_string, qr!<idsHeader type=\"document\">!, 'String');
+ like($h->to_string, qr!titleStmt!, 'String');
+ like($h->to_string, qr!</idsHeader>$!, 'String');
+ # The escaped variants replace the quotes by entities, the others do not
+ is($h->sigle, 'GOE/"AAA"', 'Check sigle');
+ is($h->sigle_esc, 'GOE/&quot;AAA&quot;', 'Check sigle escaped');
+ is($h->id, 'GOE_"AAA"', 'Check id');
+ is($h->id_esc, 'GOE_&quot;AAA&quot;', 'Check id escaped');
+ is($h->dir, 'GOE/"AAA"', 'Check dir');
+ is($h->type, 'document', 'Check type');
+};
+
+
+subtest 'Text Header' => sub {
+ $h = KorAP::XML::TEI::Header->new('<idsHeader foo="bar" type="text">');
+ ok($h, 'Header valid');
+ # Before parsing no sigle is known
+ is($h->sigle, '', 'Check sigle');
+ is($h->sigle_esc, '', 'Check sigle escaped');
+ is($h->dir, '', 'Check dir');
+ like($h->to_string, qr!^<\?xml version!, 'String');
+ like($h->to_string, qr!<idsHeader foo="bar" type=\"text\">$!, 'String');
+
+ my ($fh, $filename) = korap_tempfile('header_3');
+
+ print $fh <<'HTML';
+ <fileDesc>
+ <titleStmt>
+ <textSigle>GOE/"AAA".00003</textSigle>
+ </titleStmt>
+</idsHeader>
+Test
+HTML
+
+ seek($fh, 0, 0);
+
+ ok($h->parse($fh), 'Parsing');
+
+ like($h->to_string, qr!^<\?xml version!, 'String');
+ like($h->to_string, qr!<idsHeader foo="bar" type=\"text\">!, 'String');
+ like($h->to_string, qr!titleStmt!, 'String');
+ like($h->to_string, qr!</idsHeader>$!, 'String');
+ # The sigle is written to the header text in its escaped form
+ like($h->to_string, qr!GOE/&quot;AAA&quot;\.00003!, 'String');
+ # The escaped variants replace the quotes by entities, the others do not
+ is($h->sigle, 'GOE/"AAA".00003', 'Check sigle');
+ is($h->sigle_esc, 'GOE/&quot;AAA&quot;.00003', 'Check sigle escaped');
+ is($h->id, 'GOE_"AAA".00003', 'Check id');
+ is($h->id_esc, 'GOE_&quot;AAA&quot;.00003', 'Check id escaped');
+ is($h->dir, 'GOE/"AAA"/00003', 'Check dir');
+ is($h->type, 'text', 'Check type');
+};
+
+
+done_testing;