blob: 5b4ef73907e88d529685cbd9b84f283addf54e00 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001package KorAP::Document;
2use Mojo::Base -base;
3use v5.16;
4
5use Mojo::ByteStream 'b';
6use Mojo::DOM;
Nils Diewald7364d1f2013-11-05 19:26:35 +00007use Carp qw/croak/;
Nils Diewald2db9ad02013-10-29 19:26:43 +00008use KorAP::Document::Primary;
9
# Names of the plain meta data attributes. to_string and to_hash
# serialize exactly these, in this order.
our @ATTR = qw/id corpus_id pub_date
               title sub_title pub_place/;

# Accessors: the document path plus one accessor per meta data attribute
has [ 'path', @ATTR ];

# Logger attribute; the default is the Log4perl logger named after this
# package.
# NOTE(review): Log::Log4perl is not loaded in this file - presumably the
# application loads it before the logger is first used; confirm.
has log => sub {
  return Log::Log4perl->get_logger(__PACKAGE__);
};
Nils Diewald2db9ad02013-10-29 19:26:43 +000016
# Parse the document below the configured path.
#
# Reads 'data.xml', takes the document id from the 'docid' attribute of
# the <raw_text> element, derives the corpus id from the part of that id
# before the first underscore, wraps the content of <text> in a
# KorAP::Document::Primary object and finally reads the meta data from
# 'header.xml'.
#
# Returns 1 on success. Croaks if <raw_text> or its id is missing, if the
# id contains no underscore-terminated corpus prefix, or if there is no
# <text> element.
sub parse {
  my $self = shift;
  my $file = b($self->path . 'data.xml')->slurp;

  # Deliberately a plain lexical: a 'state' variable is initialised only
  # on the first call, so the errors for every later document would name
  # the path of the first document parsed in this process
  my $unable = 'Unable to parse document ' . $self->path;

  $self->log->debug('Parse document ' . $self->path);

  my $dom = Mojo::DOM->new($file);
  my $rt  = $dom->at('raw_text');

  # Get document id and corpus id
  croak $unable . ': No raw_text found or no ID'
    unless $rt && $rt->attr('docid');

  $self->id($rt->attr('docid'));

  # The corpus id is everything before the first underscore of the id
  if ($self->id =~ /^([^_]+)_/) {
    $self->corpus_id($1);
  }
  else {
    croak $unable . ': ID not parseable';
  };

  # Get primary data
  my $pd = $rt->at('text')
    or croak $unable . ': No primary data found';

  $pd = b($pd->text)->decode;
  $self->{pd} = KorAP::Document::Primary->new($pd->to_string);

  # Get meta data
  $self->_parse_meta;
  return 1;
};
59
60
# Primary data
#
# Returns whatever parse stored as primary data (a
# KorAP::Document::Primary object), or undef if nothing was stored yet.
sub primary {
  my $self = shift;
  return $self->{pd};
};
65
# Get or set the authors of the document.
#
# With a true argument the string is split at semicolons followed by
# whitespace, entries consisting only of "u.a." are dropped, and the
# resulting array reference is stored and returned. Without a true
# argument the stored array reference is returned, or an empty one if no
# authors were set.
sub author {
  my ($self, $names) = @_;

  # Getter
  return ($self->{authors} // []) unless $names;

  # Set authors
  my @authors;
  foreach my $name (split(/;\s+/, $names)) {
    next if $name =~ m{^\s*u\.a\.\s*$};
    push @authors, $name;
  };

  return $self->{authors} = \@authors;
};
77
# Get or set the text classes of the document.
#
# If the first argument is true, all arguments are stored as the list of
# text classes and the new array reference is returned. Otherwise the
# stored array reference is returned, or an empty one if none was set.
sub text_class {
  my ($self, @topics) = @_;

  # Getter (also taken when the first topic is false)
  return ($self->{topics} // []) unless $topics[0];

  $self->{topics} = \@topics;
  return $self->{topics};
};
85
86
87
# Read the meta data of the document from 'header.xml' below the path.
#
# Sets title, sub_title and author from the <analytic> element (if there
# is one), builds pub_date as "YYYYMMDD" from the pubDate elements of
# type year, month and day, and collects the text classes from the
# 'target' attributes of all "textClass catRef" elements.
sub _parse_meta {
  my $self = shift;

  # The header is read as ISO-8859-1
  my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1');

  my $dom = Mojo::DOM->new($file);

  # Title, subtitle and author are looked up below <analytic>. A header
  # without that element leaves them unset instead of dying on a method
  # call on undef.
  if (my $analytic = $dom->at('analytic')) {

    # Get title
    my $title = $analytic->at('h\.title[type=main]');
    $self->title($title->text) if $title;

    # Get Subtitle
    my $sub_title = $analytic->at('h\.title[type=sub]');
    $self->sub_title($sub_title->text) if $sub_title;

    # Get Author
    my $author = $analytic->at('h\.author');
    $self->author($author->all_text) if $author;
  };

  # Get pubDate: missing or non-numeric parts count as 0
  my %part_of;
  foreach my $type (qw/year month day/) {
    my $node  = $dom->at("pubDate[type=$type]");
    my $value = $node ? $node->text : 0;
    $part_of{$type} = $value =~ /^\d+$/ ? $value : 0;
  };

  # Always zero-pad, so that a one-digit year ("5") or an over-padded
  # part ("005") cannot break the fixed YYYYMMDD layout. Years below 100
  # are taken as years of the 21st century.
  my $year = $part_of{year};
  my $date = !$year      ? '0000'
           : $year < 100 ? sprintf('20%02d', $year)
           :               sprintf('%04d', $year);
  $date .= sprintf('%02d', $part_of{month});
  $date .= sprintf('%02d', $part_of{day});

  $self->pub_date($date);

  # Get textClasses
  my @topic;
  $dom->find("textClass catRef")->each(
    sub {
      # The first dot-separated segment of the target is dropped;
      # a catRef without a target contributes nothing
      my ($ign, @ttopic) = split('\.', $_->attr('target') // '');
      push(@topic, @ttopic);
    }
  );
  $self->text_class(@topic);
};
136
# Serialize the meta data as "key = value" lines.
#
# Emits one line per true attribute listed in @ATTR, then one "author"
# line per author and one "text_class" line per text class. Newlines and
# runs of whitespace inside attribute and author values are collapsed to
# a single space. Returns the empty string if nothing is set.
sub to_string {
  my $self = shift;

  my $string = '';

  foreach my $name (@ATTR) {
    my $att = $self->$name or next;
    $att =~ s/\n/ /g;
    $att =~ s/\s\s+/ /g;
    $string .= $name . ' = ' . $att . "\n";
  };

  # Normalise a copy of each name: the loop variable aliases the stored
  # list, so substituting on it directly would rewrite the document's
  # authors as a side effect of serializing
  foreach my $stored (@{ $self->author }) {
    my $author = $stored;
    $author =~ s/\n/ /g;
    $author =~ s/\s\s+/ /g;
    $string .= 'author = ' . $author . "\n";
  };

  foreach my $class (@{ $self->text_class }) {
    $string .= 'text_class = ' . $class . "\n";
  };

  return $string;
};
166
# Turn an attribute name into its hash key: each underscore followed by
# a word character becomes that character in upper case, and a trailing
# "id" (in any case) becomes "ID", e.g. corpus_id => corpusID.
sub _k {
  my ($name) = @_;
  my $camel = $name =~ s/_(\w)/\U$1\E/gr;
  return $camel =~ s/id$/ID/gir;
};
173
Nils Diewald7364d1f2013-11-05 19:26:35 +0000174
# Return the meta data as a hash reference.
#
# Keys are the attribute names converted by _k. Only true attributes of
# @ATTR are included, with newlines and runs of whitespace collapsed to
# a single space. The authors (joined by ',') and the text classes
# (joined by ' ') are always included, even when empty.
sub to_hash {
  my $self = shift;

  my %hash;

  for my $name (@ATTR) {
    my $value = $self->$name or next;
    $value =~ s/\n/ /g;
    $value =~ s/\s\s+/ /g;
    $hash{_k($name)} = $value;
  };

  $hash{_k('author')}     = join(',', @{ $self->author });
  $hash{_k('text_class')} = join(' ', @{ $self->text_class });

  return \%hash;
};
198
199
Nils Diewald2db9ad02013-10-29 19:26:43 +00002001;
201
202
203__END__
204
205=pod
206
207=head1 NAME
208
209KorAP::Document
210
211
212=head1 SYNOPSIS
213
214 my $doc = KorAP::Document->new(
215 path => 'mydoc-1/'
216 );
217
218 $doc->parse;
219
220 print $doc->title;
221
222
223=head1 DESCRIPTION
224
225Parse the primary and meta data of a document.
226
227
=head1 ATTRIBUTES
229
230=head2 id
231
232 $doc->id(75476);
233 print $doc->id;
234
235The unique identifier of the document.
236
237
238=head2 corpus_id
239
240 $doc->corpus_id(4);
241 print $doc->corpus_id;
242
243The unique identifier of the corpus.
244
245
246=head2 path
247
248 $doc->path("example-004/");
249 print $doc->path;
250
251The path of the document.
252
253
254=head2 title
255
256 $doc->title("Der Name der Rose");
257 print $doc->title;
258
259The title of the document.
260
261
262=head2 sub_title
263
264 $doc->sub_title("Natürlich eine Handschrift");
265 print $doc->sub_title;
266
The subtitle of the document.
268
269
270=head2 pub_place
271
272 $doc->pub_place("Rom");
273 print $doc->pub_place;
274
275The publication place of the document.
276
277
278=head2 pub_date
279
  $doc->pub_date("19800404");
  print $doc->pub_date;
282
283The publication date of the document,
284in the format "YYYYMMDD".
285
286
287=head2 primary
288
289 print $doc->primary->data(0,20);
290
291The L<KorAP::Document::Primary> object containing the primary data.
292
293
294=head2 author
295
296 $doc->author('Binks, Jar Jar; Luke Skywalker');
297 print $doc->author->[0];
298
Set the author value as a semicolon-separated list of names or
get an array reference of author names.
301
302=head2 text_class
303
304 $doc->text_class(qw/news sports/);
305 print $doc->text_class->[0];
306
307Set the text class as an array or get an array
308reference of text classes.
309
310
311=head1 METHODS
312
313=head2 parse
314
315 $doc->parse;
316
Run the parsing process of the document. Returns a true value on
success and dies if the document cannot be parsed.
318
319
320=cut