| Akron | 35db6e3 | 2016-03-17 22:42:22 +0100 | [diff] [blame] | 1 | package KorAP::XML::Meta::I5; |
| 2 | use KorAP::XML::Meta::Base; |
| Akron | fbf6638 | 2016-07-12 19:44:01 +0200 | [diff] [blame] | 3 | |
# Pattern for splitting a sigle into its parts:
#   $1 - corpus part: everything up to the first '_' or '/'
#   $2 - document part (optional): follows that separator, ends before the first '.'
#   $3 - text part (optional): everything after that '.'
our $SIGLE_RE = qr/^([^_\/]+)(?:[_\/]([^\._\/]+?)(?:\.(.+?))?)?$/;
| Akron | 35db6e3 | 2016-03-17 22:42:22 +0100 | [diff] [blame] | 5 | |
# Normalize whitespace of a string in place and return the result.
#
# Runs of two or more whitespace characters collapse to a single space,
# leading and trailing whitespace is removed, and a value that consists
# only of dashes becomes the empty string.
#
# The argument itself is modified (it is reached through the @_ alias),
# so it has to be a modifiable value; the prototype makes the sub parse
# like a unary operator at its call sites.
sub _squish ($) {
  $_[0] =~ s!\s\s+! !g;
  $_[0] =~ s!^\s*!!;
  $_[0] =~ s!\s*$!!;
  $_[0] =~ s!^\-+$!!g;
  return $_[0];
};
| 15 | |
# Parse I5 header meta data and store it in the meta object.
#
# Arguments:
#   $self - the meta object; values are written into its hash slots
#   $dom  - DOM of the I5 header, queried with CSS selectors via at() and find()
#   $type - header level, one of 'text', 'doc' or 'corpus'
#
# Sigles, titles, authors and editors that are already set are kept; all
# other fields are overwritten whenever the header provides a value.
#
# Returns 1.
sub parse {
  my ($self, $dom, $type) = @_;
  $type //= '';

  state $NR_RE  = qr/^\d+$/;
  state $REF_RE = qr!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!;

  # Parse text sigle (also yields the document and corpus sigle)
  if ($type eq 'text' && !$self->text_sigle) {
    if (my $node = $dom->at('textSigle')) {
      my $sigle = _squish($node->text);
      $self->{_text_sigle} = $sigle;

      if ($sigle =~ $SIGLE_RE) {
        my ($corpus, $doc, $text) = ($1, $2, $3);

        # The document and text part are optional in the pattern,
        # so only join the parts that are really there
        $self->{_corpus_sigle} = $corpus;
        $self->{_doc_sigle}    = join('/', $corpus, $doc) if defined $doc;
        $self->{_text_sigle}   = join('/', grep { defined } $corpus, $doc, $text);
      };
    };
  }

  # Parse document sigle (also yields the corpus sigle)
  elsif ($type eq 'doc' && !$self->doc_sigle) {
    if (my $node = $dom->at('dokumentSigle')) {
      my $sigle = _squish($node->text);
      $self->{_doc_sigle} = $sigle;

      if ($sigle =~ $SIGLE_RE) {
        my ($corpus, $doc) = ($1, $2);
        $self->{_corpus_sigle} = $corpus;
        $self->{_doc_sigle}    = join('/', $corpus, $doc) if defined $doc;
      };
    };
  }

  # Parse corpus sigle
  elsif ($type eq 'corpus' && !$self->corpus_sigle) {
    my $node = $dom->at('korpusSigle');
    $self->{_corpus_sigle} = _squish($node->text) if $node;
  };

  # TODO: May have analytic AND monogr
  foreach my $bibl ($dom->at('analytic'), $dom->at('monogr')) {
    next unless $bibl;

    # Get title, subtitle, author, editor
    my $title_node     = $bibl->at('h\.title[type=main]');
    my $sub_title_node = $bibl->at('h\.title[type=sub]');
    my $author_node    = $bibl->at('h\.author');
    my $editor_node    = $bibl->at('editor');

    my $title     = $title_node     ? _squish($title_node->all_text)     : undef;
    my $sub_title = $sub_title_node ? _squish($sub_title_node->all_text) : undef;
    my $author    = $author_node    ? _squish($author_node->all_text)    : undef;
    my $editor;

    if ($editor_node) {
      my $role = $editor_node->attr('role');
      my $name = _squish($editor_node->all_text);

      # An editor element with the translator role holds the translator.
      # Translator is only supported on the text level currently
      if ($role && $role eq 'translator') {
        $self->{translator} = $name if $name;
      }
      else {
        $editor = $name;
      };
    };

    # Text meta data
    if ($type eq 'text') {
      unless ($self->{title} || $self->{sub_title}) {
        $self->{title} = _remove_prefix($title, $self->text_sigle) if $title;
        $self->{sub_title} = $sub_title if $sub_title;
      };
      $self->{editor} //= $editor if $editor;
      $self->{author} //= $author if $author;
    }

    # Doc meta data
    elsif ($type eq 'doc') {
      unless ($self->{doc_title} || $self->{doc_sub_title}) {
        $self->{doc_title} //= _remove_prefix($title, $self->doc_sigle) if $title;
        $self->{doc_sub_title} //= $sub_title if $sub_title;
      };
      $self->{doc_author} //= $author if $author;
      $self->{doc_editor} //= $editor if $editor;
    }

    # Corpus meta data
    elsif ($type eq 'corpus') {
      unless ($self->{corpus_title} || $self->{corpus_sub_title}) {
        $self->{corpus_title} //= _remove_prefix($title, $self->corpus_sigle) if $title;
        $self->{corpus_sub_title} //= $sub_title if $sub_title;
      };
      $self->{corpus_author} //= $author if $author;
      $self->{corpus_editor} //= $editor if $editor;
    };
  };

  # Title not in analytic or monogr: fall back to the title statement.
  # Per level: target field, selector and the accessor of the sigle
  # that may prefix the title.
  my %title_fallback = (
    corpus => ['corpus_title', 'fileDesc > titleStmt > c\.title', 'corpus_sigle'],
    doc    => ['doc_title',    'fileDesc > titleStmt > d\.title', 'doc_sigle'],
    text   => ['title',        'fileDesc > titleStmt > t\.title', 'text_sigle'],
  );

  if (my $fallback = $title_fallback{$type}) {
    my ($field, $selector, $sigle_method) = @$fallback;

    # Title not yet given
    unless ($self->{$field}) {
      if (my $node = $dom->at($selector)) {
        my $title = _squish($node->all_text);
        $self->{$field} = _remove_prefix($title, $self->$sigle_method) if $title;
      };
    };
  };

  # Get PubPlace
  if (my $place = $dom->at('pubPlace')) {
    my $place_key = $place->attr('key');
    $self->{pub_place_key} = $place_key if $place_key;

    my $place_name = _squish($place->all_text);
    $self->{pub_place} = $place_name if $place_name;
  };

  # Get Publisher
  if (my $publisher = $dom->at('imprint publisher')) {
    $publisher = _squish($publisher->all_text);
    $self->{publisher} = $publisher if $publisher;
  };

  # Get text type, text domain, text type art and text type ref
  if (my $text_desc = $dom->at('textDesc')) {
    foreach my $pair (
      [textType    => 'text_type'],
      [textDomain  => 'text_domain'],
      [textTypeArt => 'text_type_art'],
      [textTypeRef => 'text_type_ref']
    ) {
      my ($tag, $field) = @$pair;
      my $node = $text_desc->at($tag) or next;
      my $value = _squish($node->all_text);
      $self->{$field} = $value if $value;
    };
  };

  # Get pubDate.
  # Year, month and day are siblings below a common parent; if there is
  # more than one year element, the last one wins.
  $dom->find('pubDate[type=year]')->each(
    sub {
      my $parent = shift->parent;
      my $year_node = $parent->at('pubDate[type=year]') or return;
      my $month_node = $parent->at('pubDate[type=month]');
      my $day_node   = $parent->at('pubDate[type=day]');

      my $year  = _squish($year_node->text);
      my $month = $month_node ? _squish($month_node->text) : 0;
      my $day   = $day_node   ? _squish($day_node->text)   : 0;

      # Everything that is not a plain number counts as unknown
      $year  = 0 if $year  !~ $NR_RE;
      $month = 0 if $month !~ $NR_RE;
      $day   = 0 if $day   !~ $NR_RE;

      # Years below 100 are prefixed with '20'
      my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
      $date .= length($month) == 1 ? '0' . $month : $month;
      $date .= length($day) == 1 ? '0' . $day : $day;
      $self->{pub_date} = $date;
    });

  # creatDate
  my $create_node = $dom->at('creatDate');
  if ($create_node && $create_node->text) {
    my $create_date = _squish($create_node->all_text);

    if (index($create_date, '-') > -1) {
      $self->log->warn("Creation date ranges are not supported");

      # Only keep the start of the range; split may return nothing at all
      ($create_date) = split /\s*-\s*/, $create_date;
      $create_date //= '';
    };

    # Accept YYYY, YYYY.MM and YYYY.MM.DD - missing parts become '00'
    if ($create_date =~ /^(\d{4})(?:\.(\d{2})(?:\.(\d{2}))?)?$/) {
      $self->{creation_date} = $1 . ($2 // '00') . ($3 // '00');
    };
  };

  if (my $text_class = $dom->at('textClass')) {

    # Get textClasses
    my @topic;
    $text_class->find('catRef')->each(
      sub {
        my $target = shift->attr('target');
        return unless defined $target;

        # The first segment of the target is ignored
        my ($ign, @ttopic) = grep { $_ } map { _squish($_) } split('\.', $target);
        push(@topic, @ttopic);
      }
    );
    $self->{text_class} = [@topic] if @topic > 0;

    # Get keywords.
    # The selector has to be single quoted to keep the escaped dot of the
    # element name 'h.keywords' - otherwise it selects by class name
    my @keywords = $text_class->find('h\.keywords > keyTerm')
      ->map(sub { _squish($_->all_text) })
      ->grep(sub { $_ })
      ->each;

    # Append to the list stored in the object, creating it if necessary
    # NOTE(review): assumes $self->{keywords} is an array reference when set
    push(@{ $self->{keywords} //= [] }, @keywords) if @keywords > 0;
  };

  if (my $edition = $dom->at('biblFull editionStmt')) {
    $edition = _squish($edition->all_text);
    $self->{bibl_edition_statement} = $edition if $edition;
  };

  # Direct child only, so the edition statement of the
  # bibliographic source is not picked up here
  if (my $edition = $dom->at('fileDesc > editionStmt')) {
    $edition = _squish($edition->all_text);
    $self->{file_edition_statement} = $edition if $edition;
  };

  if (my $file_desc = $dom->at('fileDesc')) {
    if (my $availability = $file_desc->at('publicationStmt > availability')) {
      $availability = _squish($availability->all_text);
      $self->{availability} = $availability if $availability;
    };
  };

  # Some meta data only available in the corpus
  if ($type eq 'corpus') {
    if (my $language = $dom->at('profileDesc > langUsage > language[id]')) {
      my $lang_id = $language->attr('id');
      $self->{language} = $lang_id if $lang_id;
    };
  }

  # Some meta data only relevant from the text
  elsif ($type eq 'text') {

    if (my $reference = $dom->at('sourceDesc reference[type=complete]')) {
      if (my $ref_text = _squish($reference->all_text)) {

        # Strip a leading sigle like 'ABC/DEF.00001 ' or 'ABC/DEF.00001:'
        $ref_text =~ s!$REF_RE!!;
        $self->{reference} = $ref_text;
      };
    };

    my $column = $dom->at('textDesc > column');
    if ($column && ($column = _squish($column->all_text))) {
      $self->{text_column} = $column;
    };

    if (my $pages = $dom->at('biblStruct biblScope[type=pp]')) {
      $pages = _squish($pages->all_text);
      if ($pages && $pages =~ m/(\d+)\s*-\s*(\d+)/) {
        $self->{src_pages} = $1 . '-' . $2;
      };
    };
  };

  return 1;
};
| 306 | |
| 307 | |
# Strip a leading sigle from a title.
#
# Arguments:
#   title  - the title string
#   prefix - the sigle that may start the title; when it is false
#            the title is returned untouched
#
# The second slash of the sigle is turned into a dot before comparing
# ('A/B/C' is looked up as 'A/B.C'). If the title starts with that
# string, it is cut off together with one following separator
# (one of '-', ';', ':' or ',') and the surrounding whitespace.
# The result is passed through _squish.
#
# This may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
sub _remove_prefix {
  my ($title, $prefix) = @_;
  return $title unless $prefix;

  $prefix =~ s!^([^/]+?/[^/]+?)/!$1\.!;

  my $prefix_length = length($prefix);
  if (substr($title, 0, $prefix_length) eq $prefix) {

    # Cut off the prefix in place
    substr($title, 0, $prefix_length, '');
    $title =~ s!^\s*[-;:,]\s*!!;
  };

  return _squish($title);
};
| 322 | |
| 323 | |
| Akron | 35db6e3 | 2016-03-17 22:42:22 +0100 | [diff] [blame] | 324 | 1; |