Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 1 | package KorAP::Document; |
| 2 | use Mojo::Base -base; |
| 3 | use v5.16; |
| 4 | |
| 5 | use Mojo::ByteStream 'b'; |
| 6 | use Mojo::DOM; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 7 | use Carp qw/croak/; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 8 | use KorAP::Document::Primary; |
| 9 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 10 | our @ATTR = qw/id corpus_id pub_date |
| 11 | title sub_title pub_place/; |
| 12 | has 'path'; |
| 13 | has [@ATTR]; |
| 14 | |
| 15 | has log => sub { Log::Log4perl->get_logger(__PACKAGE__) }; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 16 | |
| 17 | # parse document |
| 18 | sub parse { |
| 19 | my $self = shift; |
| 20 | my $file = b($self->path . 'data.xml')->slurp; |
| 21 | |
Nils Diewald | 3ece630 | 2013-12-02 18:38:16 +0000 | [diff] [blame^] | 22 | state $unable = 'Unable to parse document ' . $self->path; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 23 | |
Nils Diewald | 3ece630 | 2013-12-02 18:38:16 +0000 | [diff] [blame^] | 24 | $self->log->debug('Parse document ' . $self->path); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 25 | |
| 26 | my $dom = Mojo::DOM->new($file); |
| 27 | |
| 28 | my $rt = $dom->at('raw_text'); |
| 29 | |
| 30 | # Get document id and corpus id |
| 31 | if ($rt && $rt->attr('docid')) { |
| 32 | $self->id($rt->attr('docid')); |
| 33 | if ($self->id =~ /^([^_]+)_/) { |
| 34 | $self->corpus_id($1); |
| 35 | } |
| 36 | else { |
Nils Diewald | 3ece630 | 2013-12-02 18:38:16 +0000 | [diff] [blame^] | 37 | croak $unable . ': ID not parseable'; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 38 | }; |
| 39 | } |
| 40 | else { |
Nils Diewald | 3ece630 | 2013-12-02 18:38:16 +0000 | [diff] [blame^] | 41 | croak $unable . ': No raw_text found or no ID'; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 42 | }; |
| 43 | |
| 44 | # Get primary data |
| 45 | my $pd = $rt->at('text'); |
| 46 | if ($pd) { |
| 47 | |
| 48 | $pd = b($pd->text)->decode; |
| 49 | $self->{pd} = KorAP::Document::Primary->new($pd->to_string); |
| 50 | } |
| 51 | else { |
| 52 | croak $unable; |
| 53 | }; |
| 54 | |
| 55 | # Get meta data |
| 56 | $self->_parse_meta; |
| 57 | return 1; |
| 58 | }; |
| 59 | |
| 60 | |
| 61 | # Primary data |
| 62 | sub primary { |
| 63 | $_[0]->{pd}; |
| 64 | }; |
| 65 | |
| 66 | sub author { |
| 67 | my $self = shift; |
| 68 | |
| 69 | # Set authors |
| 70 | if ($_[0]) { |
| 71 | return $self->{authors} = [ |
| 72 | grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift()) |
| 73 | ]; |
| 74 | } |
| 75 | return ($self->{authors} // []); |
| 76 | }; |
| 77 | |
| 78 | sub text_class { |
| 79 | my $self = shift; |
| 80 | if ($_[0]) { |
| 81 | return $self->{topics} = [ @_ ]; |
| 82 | }; |
| 83 | return ($self->{topics} // []); |
| 84 | }; |
| 85 | |
| 86 | |
| 87 | |
| 88 | sub _parse_meta { |
| 89 | my $self = shift; |
| 90 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 91 | my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1'); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 92 | |
| 93 | my $dom = Mojo::DOM->new($file); |
Nils Diewald | 682feb0 | 2013-11-29 22:48:40 +0000 | [diff] [blame] | 94 | my $analytic = $dom->at('analytic'); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 95 | |
| 96 | # Get title |
Nils Diewald | 682feb0 | 2013-11-29 22:48:40 +0000 | [diff] [blame] | 97 | my $title = $analytic->at('h\.title[type=main]'); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 98 | $self->title($title->text) if $title; |
| 99 | |
| 100 | # Get Subtitle |
Nils Diewald | 682feb0 | 2013-11-29 22:48:40 +0000 | [diff] [blame] | 101 | my $sub_title = $analytic->at('h\.title[type=sub]'); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 102 | $self->sub_title($sub_title->text) if $sub_title; |
| 103 | |
| 104 | # Get Author |
Nils Diewald | 682feb0 | 2013-11-29 22:48:40 +0000 | [diff] [blame] | 105 | my $author = $analytic->at('h\.author'); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 106 | $self->author($author->all_text) if $author; |
| 107 | |
| 108 | # Get pubDate |
| 109 | my $year = $dom->at("pubDate[type=year]"); |
| 110 | $year = $year ? $year->text : 0; |
| 111 | my $month = $dom->at("pubDate[type=month]"); |
| 112 | $month = $month ? $month->text : 0; |
| 113 | my $day = $dom->at("pubDate[type=day]"); |
| 114 | $day = $day ? $day->text : 0; |
| 115 | |
Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 116 | $year = 0 if $year !~ /^\d+$/; |
| 117 | $month = 0 if $month !~ /^\d+$/; |
| 118 | $day = 0 if $day !~ /^\d+$/; |
| 119 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 120 | my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000'; |
| 121 | $date .= length($month) == 1 ? '0' . $month : $month; |
| 122 | $date .= length($day) == 1 ? '0' . $day : $day; |
| 123 | |
| 124 | $self->pub_date($date); |
| 125 | |
| 126 | # Get textClasses |
| 127 | my @topic; |
| 128 | $dom->find("textClass catRef")->each( |
| 129 | sub { |
| 130 | my ($ign, @ttopic) = split('\.', $_->attr('target')); |
| 131 | push(@topic, @ttopic); |
| 132 | } |
| 133 | ); |
| 134 | $self->text_class(@topic); |
| 135 | }; |
| 136 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 137 | sub to_string { |
| 138 | my $self = shift; |
| 139 | |
| 140 | my $string; |
| 141 | |
| 142 | foreach (@ATTR) { |
| 143 | if (my $att = $self->$_) { |
| 144 | $att =~ s/\n/ /g; |
| 145 | $att =~ s/\s\s+/ /g; |
| 146 | $string .= $_ . ' = ' . $att . "\n"; |
| 147 | }; |
| 148 | }; |
| 149 | |
| 150 | if ($self->author) { |
| 151 | foreach (@{$self->author}) { |
| 152 | $_ =~ s/\n/ /g; |
| 153 | $_ =~ s/\s\s+/ /g; |
| 154 | $string .= 'author = ' . $_ . "\n"; |
| 155 | }; |
| 156 | }; |
| 157 | |
| 158 | if ($self->text_class) { |
| 159 | foreach (@{$self->text_class}) { |
| 160 | $string .= 'text_class = ' . $_ . "\n"; |
| 161 | }; |
| 162 | }; |
| 163 | |
| 164 | return $string; |
| 165 | }; |
| 166 | |
Nils Diewald | 044c41d | 2013-11-11 21:45:09 +0000 | [diff] [blame] | 167 | sub _k { |
| 168 | my $x = $_[0]; |
| 169 | $x =~ s/_(\w)/\U$1\E/g; |
| 170 | $x =~ s/id$/ID/gi; |
| 171 | return $x; |
| 172 | }; |
| 173 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 174 | |
| 175 | sub to_hash { |
| 176 | my $self = shift; |
| 177 | |
| 178 | my %hash; |
| 179 | |
Nils Diewald | 044c41d | 2013-11-11 21:45:09 +0000 | [diff] [blame] | 180 | foreach (@ATTR) { |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 181 | if (my $att = $self->$_) { |
| 182 | $att =~ s/\n/ /g; |
| 183 | $att =~ s/\s\s+/ /g; |
Nils Diewald | 044c41d | 2013-11-11 21:45:09 +0000 | [diff] [blame] | 184 | $hash{_k($_)} = $att; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 185 | }; |
| 186 | }; |
| 187 | |
Nils Diewald | 37e5b57 | 2013-11-20 20:26:03 +0000 | [diff] [blame] | 188 | for ('author') { |
Nils Diewald | 044c41d | 2013-11-11 21:45:09 +0000 | [diff] [blame] | 189 | $hash{_k($_)} = join(',', @{ $self->$_ }); |
| 190 | }; |
| 191 | |
Nils Diewald | 37e5b57 | 2013-11-20 20:26:03 +0000 | [diff] [blame] | 192 | for ('text_class') { |
| 193 | $hash{_k($_)} = join(' ', @{ $self->$_ }); |
| 194 | }; |
| 195 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 196 | return \%hash; |
| 197 | }; |
| 198 | |
| 199 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 200 | 1; |
| 201 | |
| 202 | |
| 203 | __END__ |
| 204 | |
| 205 | =pod |
| 206 | |
| 207 | =head1 NAME |
| 208 | |
| 209 | KorAP::Document |
| 210 | |
| 211 | |
| 212 | =head1 SYNOPSIS |
| 213 | |
| 214 | my $doc = KorAP::Document->new( |
| 215 | path => 'mydoc-1/' |
| 216 | ); |
| 217 | |
| 218 | $doc->parse; |
| 219 | |
| 220 | print $doc->title; |
| 221 | |
| 222 | |
| 223 | =head1 DESCRIPTION |
| 224 | |
| 225 | Parse the primary and meta data of a document. |
| 226 | |
| 227 | |
| 228 | =head2 ATTRIBUTES |
| 229 | |
| 230 | =head2 id |
| 231 | |
| 232 | $doc->id(75476); |
| 233 | print $doc->id; |
| 234 | |
| 235 | The unique identifier of the document. |
| 236 | |
| 237 | |
| 238 | =head2 corpus_id |
| 239 | |
| 240 | $doc->corpus_id(4); |
| 241 | print $doc->corpus_id; |
| 242 | |
| 243 | The unique identifier of the corpus. |
| 244 | |
| 245 | |
| 246 | =head2 path |
| 247 | |
| 248 | $doc->path("example-004/"); |
| 249 | print $doc->path; |
| 250 | |
| 251 | The path of the document. |
| 252 | |
| 253 | |
| 254 | =head2 title |
| 255 | |
| 256 | $doc->title("Der Name der Rose"); |
| 257 | print $doc->title; |
| 258 | |
| 259 | The title of the document. |
| 260 | |
| 261 | |
| 262 | =head2 sub_title |
| 263 | |
| 264 | $doc->sub_title("Natürlich eine Handschrift"); |
| 265 | print $doc->sub_title; |
| 266 | |
| 267 | The title of the document. |
| 268 | |
| 269 | |
| 270 | =head2 pub_place |
| 271 | |
| 272 | $doc->pub_place("Rom"); |
| 273 | print $doc->pub_place; |
| 274 | |
| 275 | The publication place of the document. |
| 276 | |
| 277 | |
| 278 | =head2 pub_date |
| 279 | |
| 280 | $doc->pub_place("19800404"); |
| 281 | print $doc->pub_place; |
| 282 | |
| 283 | The publication date of the document, |
| 284 | in the format "YYYYMMDD". |
| 285 | |
| 286 | |
| 287 | =head2 primary |
| 288 | |
| 289 | print $doc->primary->data(0,20); |
| 290 | |
| 291 | The L<KorAP::Document::Primary> object containing the primary data. |
| 292 | |
| 293 | |
| 294 | =head2 author |
| 295 | |
| 296 | $doc->author('Binks, Jar Jar; Luke Skywalker'); |
| 297 | print $doc->author->[0]; |
| 298 | |
| 299 | Set the author value as semikolon separated list of names or |
| 300 | get an array reference of author names. |
| 301 | |
| 302 | =head2 text_class |
| 303 | |
| 304 | $doc->text_class(qw/news sports/); |
| 305 | print $doc->text_class->[0]; |
| 306 | |
| 307 | Set the text class as an array or get an array |
| 308 | reference of text classes. |
| 309 | |
| 310 | |
| 311 | =head1 METHODS |
| 312 | |
| 313 | =head2 parse |
| 314 | |
| 315 | $doc->parse; |
| 316 | |
| 317 | Run the parsing process of the document |
| 318 | |
| 319 | |
| 320 | =cut |