Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 1 | package KorAP::Document; |
| 2 | use Mojo::Base -base; |
| 3 | use v5.16; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 4 | use Mojo::ByteStream 'b'; |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 5 | use Mojo::Util qw/encode/; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 6 | use XML::Fast; |
| 7 | use Try::Tiny; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 8 | use Carp qw/croak/; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 9 | use KorAP::Document::Primary; |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 10 | use Log::Log4perl; |
| 11 | use KorAP::Log; |
| 12 | use Mojo::DOM; |
| 13 | use Data::Dumper; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 14 | |
Nils Diewald | 8e323ee | 2014-04-23 17:28:14 +0000 | [diff] [blame] | 15 | our @ATTR = qw/id |
| 16 | corpus_id |
| 17 | pub_date |
| 18 | title |
| 19 | sub_title |
| 20 | pub_place/; |
| 21 | |
| 22 | our @ADVANCED_ATTR = qw/publisher |
| 23 | editor |
| 24 | text_type |
| 25 | text_type_art |
| 26 | creation_date |
| 27 | coll_title |
| 28 | coll_sub_title |
| 29 | coll_author |
| 30 | coll_editor |
| 31 | /; |
| 32 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 33 | has 'path'; |
Nils Diewald | 8e323ee | 2014-04-23 17:28:14 +0000 | [diff] [blame] | 34 | has [@ATTR, @ADVANCED_ATTR]; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 35 | |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 36 | has log => sub { |
| 37 | if(Log::Log4perl->initialized()) { |
| 38 | state $log = Log::Log4perl->get_logger(__PACKAGE__); |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 39 | }; |
| 40 | state $log = KorAP::Log->new; |
| 41 | return $log; |
| 42 | }; |
| 43 | |
| 44 | sub new { |
| 45 | my $class = shift; |
| 46 | my $self = bless { @_ }, $class; |
| 47 | if (exists $self->{path} && $self->{path} !~ m!\/$!) { |
| 48 | $self->{path} .= '/'; |
| 49 | }; |
| 50 | return $self; |
| 51 | }; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 52 | |
| 53 | # parse document |
| 54 | sub parse { |
| 55 | my $self = shift; |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 56 | |
Nils Diewald | 98767bb | 2014-04-25 20:31:19 +0000 | [diff] [blame] | 57 | my $data_xml = $self->path . 'data.xml'; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 58 | |
Nils Diewald | 98767bb | 2014-04-25 20:31:19 +0000 | [diff] [blame] | 59 | my ($rt, $error, $file); |
| 60 | |
| 61 | my $unable = 'Unable to parse document ' . $self->path; |
| 62 | |
| 63 | unless (-e $data_xml) { |
| 64 | $self->log->warn($unable . ' - no data.xml found'); |
| 65 | $error = 1; |
| 66 | } |
| 67 | |
| 68 | else { |
| 69 | $file = b($data_xml)->slurp; |
| 70 | |
| 71 | try { |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 72 | local $SIG{__WARN__} = sub { |
Nils Diewald | 98767bb | 2014-04-25 20:31:19 +0000 | [diff] [blame] | 73 | $error = 1; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 74 | }; |
| 75 | $rt = xml2hash($file, text => '#text', attr => '-')->{raw_text}; |
Nils Diewald | 98767bb | 2014-04-25 20:31:19 +0000 | [diff] [blame] | 76 | } |
| 77 | catch { |
| 78 | $self->log->warn($unable); |
| 79 | $error = 1; |
| 80 | }; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 81 | }; |
| 82 | |
| 83 | return if $error; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 84 | |
Nils Diewald | 3ece630 | 2013-12-02 18:38:16 +0000 | [diff] [blame] | 85 | $self->log->debug('Parse document ' . $self->path); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 86 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 87 | # Get document id and corpus id |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 88 | if ($rt && $rt->{'-docid'}) { |
| 89 | $self->id($rt->{'-docid'}); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 90 | if ($self->id =~ /^([^_]+)_/) { |
| 91 | $self->corpus_id($1); |
| 92 | } |
| 93 | else { |
Nils Diewald | 3ece630 | 2013-12-02 18:38:16 +0000 | [diff] [blame] | 94 | croak $unable . ': ID not parseable'; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 95 | }; |
| 96 | } |
| 97 | else { |
Nils Diewald | 3ece630 | 2013-12-02 18:38:16 +0000 | [diff] [blame] | 98 | croak $unable . ': No raw_text found or no ID'; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 99 | }; |
| 100 | |
| 101 | # Get primary data |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 102 | my $pd = $rt->{text}; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 103 | if ($pd) { |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 104 | $self->{pd} = KorAP::Document::Primary->new($pd); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 105 | } |
| 106 | else { |
| 107 | croak $unable; |
| 108 | }; |
| 109 | |
| 110 | # Get meta data |
| 111 | $self->_parse_meta; |
| 112 | return 1; |
| 113 | }; |
| 114 | |
| 115 | |
| 116 | # Primary data |
| 117 | sub primary { |
| 118 | $_[0]->{pd}; |
| 119 | }; |
| 120 | |
| 121 | sub author { |
| 122 | my $self = shift; |
| 123 | |
| 124 | # Set authors |
| 125 | if ($_[0]) { |
| 126 | return $self->{authors} = [ |
| 127 | grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift()) |
| 128 | ]; |
| 129 | } |
| 130 | return ($self->{authors} // []); |
| 131 | }; |
| 132 | |
| 133 | sub text_class { |
| 134 | my $self = shift; |
| 135 | if ($_[0]) { |
| 136 | return $self->{topics} = [ @_ ]; |
| 137 | }; |
| 138 | return ($self->{topics} // []); |
| 139 | }; |
| 140 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 141 | sub _parse_meta { |
| 142 | my $self = shift; |
| 143 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 144 | my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1'); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 145 | |
| 146 | my $dom = Mojo::DOM->new($file); |
Nils Diewald | 682feb0 | 2013-11-29 22:48:40 +0000 | [diff] [blame] | 147 | my $analytic = $dom->at('analytic'); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 148 | |
Nils Diewald | 8e323ee | 2014-04-23 17:28:14 +0000 | [diff] [blame] | 149 | if ($analytic) { |
| 150 | # Get title |
| 151 | my $title = $analytic->at('h\.title[type=main]'); |
| 152 | $self->title($title->text) if $title; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 153 | |
Nils Diewald | 8e323ee | 2014-04-23 17:28:14 +0000 | [diff] [blame] | 154 | # Get Subtitle |
| 155 | my $sub_title = $analytic->at('h\.title[type=sub]'); |
| 156 | $self->sub_title($sub_title->text) if $sub_title; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 157 | |
Nils Diewald | 8e323ee | 2014-04-23 17:28:14 +0000 | [diff] [blame] | 158 | # Get Author |
| 159 | my $author = $analytic->at('h\.author'); |
| 160 | $self->author($author->all_text) if $author; |
| 161 | |
| 162 | # Get Editor |
| 163 | my $editor = $analytic->at('editor'); |
| 164 | $self->editor($editor->all_text) if $editor; |
| 165 | }; |
| 166 | |
| 167 | # Get PubPlace |
| 168 | my $place = $dom->at('pubPlace'); |
| 169 | $self->pub_place($place->all_text) if $place; |
| 170 | |
| 171 | # Get Publisher |
| 172 | my $publisher = $dom->at('publisher'); |
| 173 | $self->publisher($publisher->all_text) if $publisher; |
| 174 | |
| 175 | my $mono = $dom->at('monogr'); |
| 176 | if ($mono) { |
| 177 | # Get title |
| 178 | my $title = $mono->at('h\.title[type=main]'); |
| 179 | |
| 180 | # It's a monograph |
| 181 | if (!$self->title) { |
| 182 | $self->title($title->text) if $title; |
| 183 | |
| 184 | # Get Subtitle |
| 185 | my $sub_title = $mono->at('h\.title[type=sub]'); |
| 186 | $self->sub_title($sub_title->text) if $sub_title; |
| 187 | |
| 188 | } |
| 189 | else { |
| 190 | $self->coll_title($title->text) if $title; |
| 191 | |
| 192 | # Get Subtitle |
| 193 | my $sub_title = $mono->at('h\.title[type=sub]'); |
| 194 | $self->coll_sub_title($sub_title->text) if $sub_title; |
| 195 | }; |
| 196 | |
| 197 | # Get Author |
| 198 | my $author = $mono->at('h\.author'); |
| 199 | $self->coll_author($author->all_text) if $author; |
| 200 | |
| 201 | # Get editor |
| 202 | my $editor = $mono->at('editor'); |
| 203 | $self->coll_editor($editor->all_text) if $editor; |
| 204 | }; |
| 205 | |
| 206 | # Get text type |
| 207 | my $text_type = $dom->at('textDesc textType'); |
| 208 | $self->text_type($text_type->all_text) if $text_type; |
| 209 | |
| 210 | # Get text type |
| 211 | my $text_type_art = $dom->at('textDesc textTypeArt'); |
| 212 | $self->text_type_art($text_type_art->all_text) if $text_type_art; |
| 213 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 214 | |
| 215 | # Get pubDate |
| 216 | my $year = $dom->at("pubDate[type=year]"); |
| 217 | $year = $year ? $year->text : 0; |
| 218 | my $month = $dom->at("pubDate[type=month]"); |
| 219 | $month = $month ? $month->text : 0; |
| 220 | my $day = $dom->at("pubDate[type=day]"); |
| 221 | $day = $day ? $day->text : 0; |
| 222 | |
Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 223 | $year = 0 if $year !~ /^\d+$/; |
| 224 | $month = 0 if $month !~ /^\d+$/; |
| 225 | $day = 0 if $day !~ /^\d+$/; |
| 226 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 227 | my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000'; |
| 228 | $date .= length($month) == 1 ? '0' . $month : $month; |
| 229 | $date .= length($day) == 1 ? '0' . $day : $day; |
| 230 | |
| 231 | $self->pub_date($date); |
| 232 | |
Nils Diewald | 8e323ee | 2014-04-23 17:28:14 +0000 | [diff] [blame] | 233 | # creatDate |
| 234 | my $createdate = $dom->at('creatDate'); |
| 235 | if ($createdate) { |
| 236 | $createdate = $createdate->all_text; |
| 237 | if (index($createdate, '-') > -1) { |
| 238 | $self->log->warn("Creation date ranges are not supported yet"); |
| 239 | } |
| 240 | else { |
| 241 | $createdate =~ s{^(\d{4})$}{$1\.00}; |
| 242 | $createdate =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00}; |
| 243 | if ($createdate =~ /^\d{4}\.\d{2}\.\d{2}$/) { |
| 244 | $createdate =~ tr/\.//d; |
| 245 | $self->creation_date($createdate); |
| 246 | }; |
| 247 | }; |
| 248 | }; |
| 249 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 250 | # Get textClasses |
| 251 | my @topic; |
| 252 | $dom->find("textClass catRef")->each( |
| 253 | sub { |
| 254 | my ($ign, @ttopic) = split('\.', $_->attr('target')); |
| 255 | push(@topic, @ttopic); |
| 256 | } |
| 257 | ); |
| 258 | $self->text_class(@topic); |
| 259 | }; |
| 260 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 261 | sub to_string { |
| 262 | my $self = shift; |
| 263 | |
| 264 | my $string; |
| 265 | |
| 266 | foreach (@ATTR) { |
| 267 | if (my $att = $self->$_) { |
| 268 | $att =~ s/\n/ /g; |
| 269 | $att =~ s/\s\s+/ /g; |
| 270 | $string .= $_ . ' = ' . $att . "\n"; |
| 271 | }; |
| 272 | }; |
| 273 | |
| 274 | if ($self->author) { |
| 275 | foreach (@{$self->author}) { |
| 276 | $_ =~ s/\n/ /g; |
| 277 | $_ =~ s/\s\s+/ /g; |
| 278 | $string .= 'author = ' . $_ . "\n"; |
| 279 | }; |
| 280 | }; |
| 281 | |
| 282 | if ($self->text_class) { |
| 283 | foreach (@{$self->text_class}) { |
| 284 | $string .= 'text_class = ' . $_ . "\n"; |
| 285 | }; |
| 286 | }; |
| 287 | |
| 288 | return $string; |
| 289 | }; |
| 290 | |
Nils Diewald | 044c41d | 2013-11-11 21:45:09 +0000 | [diff] [blame] | 291 | sub _k { |
| 292 | my $x = $_[0]; |
| 293 | $x =~ s/_(\w)/\U$1\E/g; |
| 294 | $x =~ s/id$/ID/gi; |
| 295 | return $x; |
| 296 | }; |
| 297 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 298 | |
| 299 | sub to_hash { |
| 300 | my $self = shift; |
| 301 | |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 302 | $self->parse unless $self->id; |
| 303 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 304 | my %hash; |
| 305 | |
Nils Diewald | 044c41d | 2013-11-11 21:45:09 +0000 | [diff] [blame] | 306 | foreach (@ATTR) { |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 307 | if (my $att = $self->$_) { |
| 308 | $att =~ s/\n/ /g; |
| 309 | $att =~ s/\s\s+/ /g; |
Nils Diewald | 044c41d | 2013-11-11 21:45:09 +0000 | [diff] [blame] | 310 | $hash{_k($_)} = $att; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 311 | }; |
| 312 | }; |
| 313 | |
Nils Diewald | 37e5b57 | 2013-11-20 20:26:03 +0000 | [diff] [blame] | 314 | for ('author') { |
Nils Diewald | 044c41d | 2013-11-11 21:45:09 +0000 | [diff] [blame] | 315 | $hash{_k($_)} = join(',', @{ $self->$_ }); |
| 316 | }; |
| 317 | |
Nils Diewald | 37e5b57 | 2013-11-20 20:26:03 +0000 | [diff] [blame] | 318 | for ('text_class') { |
| 319 | $hash{_k($_)} = join(' ', @{ $self->$_ }); |
| 320 | }; |
| 321 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 322 | return \%hash; |
| 323 | }; |
| 324 | |
| 325 | |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 326 | |
| 327 | sub _parse_meta_fast { |
| 328 | my $self = shift; |
| 329 | |
| 330 | # my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1'); |
| 331 | my $file = b($self->path . 'header.xml')->slurp; |
| 332 | |
| 333 | my ($meta, $error); |
Nils Diewald | 98767bb | 2014-04-25 20:31:19 +0000 | [diff] [blame] | 334 | my $unable = 'Unable to parse document ' . $self->path; |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 335 | |
| 336 | try { |
| 337 | local $SIG{__WARN__} = sub { |
| 338 | $error = 1; |
| 339 | }; |
| 340 | $meta = xml2hash($file, text => '#text', attr => '-', array => ['h.title', 'imprint', 'catRef', 'h.author'])->{idsHeader}; |
| 341 | } |
| 342 | catch { |
| 343 | $self->log->warn($unable); |
| 344 | $error = 1; |
| 345 | }; |
| 346 | |
| 347 | return if $error; |
| 348 | |
| 349 | my $bibl_struct = $meta->{fileDesc}->{sourceDesc}->{biblStruct}; |
| 350 | my $analytic = $bibl_struct->{analytic}; |
| 351 | |
| 352 | my $titles = $analytic->{'h.title'}; |
| 353 | foreach (@$titles) { |
| 354 | if ($_->{'-type'} eq 'main') { |
| 355 | $self->title($_->{'#text'}); |
| 356 | } |
| 357 | elsif ($_->{'-type'} eq 'sub') { |
| 358 | $self->sub_title($_->{'#text'}); |
| 359 | }; |
| 360 | }; |
| 361 | |
| 362 | # Get Author |
| 363 | if (my $author = $analytic->{'h.author'}) { |
| 364 | $self->author($author->[0]); |
| 365 | }; |
| 366 | |
| 367 | # Get pubDate |
| 368 | my $date = $bibl_struct->{monogr}->{imprint}; |
| 369 | my ($year, $month, $day) = (0,0,0); |
| 370 | foreach (@$date) { |
| 371 | warn $date; |
| 372 | if ($date->{-type} eq 'year') { |
| 373 | $year = $date->{'#text'}; |
| 374 | } |
| 375 | elsif ($date->{-type} eq 'month') { |
| 376 | $month = $date->{'#text'}; |
| 377 | } |
| 378 | elsif ($date->{-type} eq 'day') { |
| 379 | $day = $date->{'#text'}; |
| 380 | }; |
| 381 | }; |
| 382 | |
| 383 | $year = 0 if $year !~ /^\d+$/; |
| 384 | $month = 0 if $month !~ /^\d+$/; |
| 385 | $day = 0 if $day !~ /^\d+$/; |
| 386 | |
| 387 | $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000'; |
| 388 | $date .= length($month) == 1 ? '0' . $month : $month; |
| 389 | $date .= length($day) == 1 ? '0' . $day : $day; |
| 390 | |
| 391 | $self->pub_date($date); |
| 392 | |
| 393 | # Get textClasses |
| 394 | my @topic; |
| 395 | my $textClass = $meta->{profileDesc}->{textClass}->{catRef}; |
| 396 | foreach (@$textClass) { |
| 397 | my ($ign, @ttopic) = split('\.', $_->{'-target'}); |
| 398 | push(@topic, @ttopic); |
| 399 | }; |
| 400 | $self->text_class(@topic); |
| 401 | }; |
| 402 | |
| 403 | |
| 404 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 405 | 1; |
| 406 | |
| 407 | |
| 408 | __END__ |
| 409 | |
| 410 | =pod |
| 411 | |
| 412 | =head1 NAME |
| 413 | |
| 414 | KorAP::Document |
| 415 | |
| 416 | |
| 417 | =head1 SYNOPSIS |
| 418 | |
| 419 | my $doc = KorAP::Document->new( |
| 420 | path => 'mydoc-1/' |
| 421 | ); |
| 422 | |
| 423 | $doc->parse; |
| 424 | |
| 425 | print $doc->title; |
| 426 | |
| 427 | |
| 428 | =head1 DESCRIPTION |
| 429 | |
| 430 | Parse the primary and meta data of a document. |
| 431 | |
| 432 | |
| 433 | =head2 ATTRIBUTES |
| 434 | |
| 435 | =head2 id |
| 436 | |
| 437 | $doc->id(75476); |
| 438 | print $doc->id; |
| 439 | |
| 440 | The unique identifier of the document. |
| 441 | |
| 442 | |
| 443 | =head2 corpus_id |
| 444 | |
| 445 | $doc->corpus_id(4); |
| 446 | print $doc->corpus_id; |
| 447 | |
| 448 | The unique identifier of the corpus. |
| 449 | |
| 450 | |
| 451 | =head2 path |
| 452 | |
| 453 | $doc->path("example-004/"); |
| 454 | print $doc->path; |
| 455 | |
| 456 | The path of the document. |
| 457 | |
| 458 | |
| 459 | =head2 title |
| 460 | |
| 461 | $doc->title("Der Name der Rose"); |
| 462 | print $doc->title; |
| 463 | |
| 464 | The title of the document. |
| 465 | |
| 466 | |
| 467 | =head2 sub_title |
| 468 | |
| 469 | $doc->sub_title("Natürlich eine Handschrift"); |
| 470 | print $doc->sub_title; |
| 471 | |
| 472 | The title of the document. |
| 473 | |
| 474 | |
| 475 | =head2 pub_place |
| 476 | |
| 477 | $doc->pub_place("Rom"); |
| 478 | print $doc->pub_place; |
| 479 | |
| 480 | The publication place of the document. |
| 481 | |
| 482 | |
| 483 | =head2 pub_date |
| 484 | |
| 485 | $doc->pub_place("19800404"); |
| 486 | print $doc->pub_place; |
| 487 | |
| 488 | The publication date of the document, |
| 489 | in the format "YYYYMMDD". |
| 490 | |
| 491 | |
| 492 | =head2 primary |
| 493 | |
| 494 | print $doc->primary->data(0,20); |
| 495 | |
| 496 | The L<KorAP::Document::Primary> object containing the primary data. |
| 497 | |
| 498 | |
| 499 | =head2 author |
| 500 | |
| 501 | $doc->author('Binks, Jar Jar; Luke Skywalker'); |
| 502 | print $doc->author->[0]; |
| 503 | |
| 504 | Set the author value as semikolon separated list of names or |
| 505 | get an array reference of author names. |
| 506 | |
| 507 | =head2 text_class |
| 508 | |
| 509 | $doc->text_class(qw/news sports/); |
| 510 | print $doc->text_class->[0]; |
| 511 | |
| 512 | Set the text class as an array or get an array |
| 513 | reference of text classes. |
| 514 | |
| 515 | |
| 516 | =head1 METHODS |
| 517 | |
| 518 | =head2 parse |
| 519 | |
| 520 | $doc->parse; |
| 521 | |
| 522 | Run the parsing process of the document |
| 523 | |
| 524 | |
| 525 | =cut |