| package KorAP::XML::TEI::Header; |
| use strict; |
| use warnings; |
| use Log::Any qw($log); |
| use Encode qw(encode decode); |
| use KorAP::XML::TEI qw!escape_xml!; |
| use KorAP::XML::TEI qw!remove_xml_comments replace_entities!; |
| |
| # Parsing of i5 header files |
| |
| # Warning: |
| # Opening and closing tags (without attributes) have to be in one line |
| |
| # TODO: IDS-specific (and redundant) |
| my $_HEADER_TAG = 'idsHeader'; |
| |
| use constant { |
| TEXT => 0, |
| HEADTYPE => 1, |
| SIGLE => 2, |
| INPUTENC => 3 |
| }; |
| |
| |
| # convert header type to sigle type |
| our %sig = ( |
| corpus => 'korpusSigle', |
| document => 'dokumentSigle', |
| text => 'textSigle' |
| ); |
| |
| |
| # Create new header object |
| sub new { |
| my ($class, $text, $input_enc) = @_; |
| |
| my $self = bless [$text, undef, '', $input_enc // 'UTF-8'], $class; |
| |
| # Check header types to distinguish between siglen types |
| if ($text =~ m!^<${_HEADER_TAG}\s+[^<]*type="([^"]+)"!) { |
| $self->[HEADTYPE] = $1; |
| } |
| |
| # Unexpected header init |
| else { |
| die $log->fatal("Unable to parse header init '$text'"); |
| return; |
| }; |
| |
| return $self; |
| }; |
| |
| |
| # Parse header object from filehandle |
| sub parse { |
| my ($self, $fh) = @_; |
| |
| my $sig_type = $sig{$self->[HEADTYPE]} // 'textSigle'; |
| |
| my $pos; |
| my $l = length('</' . $_HEADER_TAG) + 1; |
| |
| # Iterate over file handle |
| while (<$fh>) { |
| |
| $_ = decode($self->[INPUTENC], $_); |
| $_ = replace_entities($_); |
| |
| # Change: |
| # This version keeps comments in header files |
| |
| # End of header found - finish parsing |
| if (($pos = index($_, '</' . $_HEADER_TAG)) >= 0) { |
| |
| # Add to text |
| $self->[TEXT] .= substr($_, 0, $l + $pos); |
| |
| die $log->fatal("Line with tag '</${_HEADER_TAG}>' (L$.) contains additional information") |
| if substr($_, $l + $pos) !~ /^\s*$/; |
| |
| if ($self->dir eq '') { |
| $log->error("Empty '<$sig_type />' (L$.) in header"); |
| return; |
| }; |
| |
| return $self; |
| }; |
| |
| # Check for sigle in line |
| if (index($_, '<' . $sig_type) >= 0) { |
| |
| unless (m!^\s*<$sig_type[^>]*>([^<]*)</$sig_type>\s*$!) { |
| die $log->fatal("line with '<$sig_type />' (L$.) is not in expected format"); |
| }; |
| |
| $self->[SIGLE] = encode('UTF-8' , $1); |
| |
| # Escape sig |
| my $sig_esc = decode('UTF-8', $self->sigle_esc); |
| |
| # replace sigle in header, if there's an escaped version that differs |
| s!$1</$sig_type>!$sig_esc</$sig_type>! if $sig_esc ne $1; |
| }; |
| |
| # Add line to header text |
| $self->[TEXT] .= $_; |
| }; |
| }; |
| |
| |
| # Type of the header |
| sub type { |
| $_[0]->[HEADTYPE]; |
| }; |
| |
| |
| # Directory (leveled) of the header file as UTF-8 |
| sub dir { |
| $_[0]->[SIGLE] =~ tr/\./\//r; |
| }; |
| |
| |
| # corpus/doc/text sigle - as UTF-8 |
| sub sigle { |
| $_[0]->[SIGLE]; |
| }; |
| |
| |
| # corpus/doc/text id |
| sub id { |
| decode('UTF-8', $_[0]->[SIGLE] =~ tr/\//_/r); |
| }; |
| |
| |
| # corpus/doc/text sigle escaped - as UTF-8 |
| sub sigle_esc { |
| escape_xml($_[0]->[SIGLE]); |
| }; |
| |
| |
| # corpus/doc/text id escaped |
| sub id_esc { |
| escape_xml($_[0]->id); |
| }; |
| |
| |
| # Return data as a string |
| sub to_string { |
| return $_[0]->_header . $_[0]->[TEXT]; |
| }; |
| |
| |
| # Header for XML output |
| sub _header { |
| # TODO: IDS-specific |
| return <<"HEADER"; |
| <?xml version="1.0" encoding="UTF-8"?> |
| <?xml-model href="header.rng" |
| type="application/xml" |
| schematypens="http://relaxng.org/ns/structure/1.0"?> |
| <!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" |
| "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd"> |
| HEADER |
| }; |
| |
| |
| # Write data to zip stream |
| sub to_zip { |
| my ($self, $zip) = @_; |
| $zip->print(encode('UTF-8', $self->to_string)); |
| return $self; |
| }; |
| |
| |
| 1; |
| |