| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 1 | package KorAP::XML::Krill; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 2 | use Mojo::Base -base; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 3 | use Mojo::ByteStream 'b'; |
| Akron | 918ce42 | 2017-06-16 20:28:43 +0200 | [diff] [blame] | 4 | use Mojo::Util qw/encode html_unescape/; |
| Akron | 3ec0a1c | 2017-01-18 14:41:55 +0100 | [diff] [blame] | 5 | use Mojo::File; |
| Akron | 14ca9f0 | 2016-01-29 19:38:18 +0100 | [diff] [blame] | 6 | use Scalar::Util qw/weaken/; |
| Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 7 | use XML::Fast; |
| 8 | use Try::Tiny; |
| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 9 | use KorAP::XML::Document::Primary; |
| Akron | 941c1a6 | 2016-02-23 17:41:41 +0100 | [diff] [blame] | 10 | use KorAP::XML::Tokenizer; |
| Akron | b9c3381 | 2020-10-21 16:19:35 +0200 | [diff] [blame] | 11 | use Log::Any qw($log); |
| Akron | 11c8030 | 2016-03-18 19:44:43 +0100 | [diff] [blame] | 12 | use Cache::FastMmap; |
| Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 13 | use Mojo::DOM; |
| Akron | af670ae | 2016-10-24 20:14:32 +0200 | [diff] [blame] | 14 | use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/; |
| Akron | c4ec093 | 2020-08-06 09:19:22 +0200 | [diff] [blame] | 15 | use Exporter 'import'; |
| 16 | |
# Functions that can be imported on request
our @EXPORT_OK = qw(get_file_name get_file_name_from_glob);

our $VERSION = '0.47';

# Location of the document directory (made absolute by the constructor)
has 'path';

# Sigles of the text, its document and its corpus, set by parse()
has [qw/text_sigle doc_sigle corpus_sigle/];

# Metadata flavour; parse() loads KorAP::XML::Meta::<meta_type>
has meta_type => 'I5';

# Both values are handed through to the metadata object by parse()
has [qw/cache lang/];

# Logger, defaulting to the Log::Any logger imported by this package
has log => sub { $log };
| 30 | |
# Constructor.
# Accepts a list of key/value pairs that become the object's fields.
# A given "path" is turned into an absolute path that always ends
# with a slash.
sub new {
  my ($class, @fields) = @_;
  my $self = bless { @fields }, $class;

  # Nothing to normalize without a path
  return $self unless exists $self->{path};

  my $path = rel2abs($self->{path});
  $path .= '/' unless $path =~ m!\/$!;
  $self->{path} = $path;

  return $self;
}
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 45 | |
| Akron | 35db6e3 | 2016-03-17 22:42:22 +0100 | [diff] [blame] | 46 | |
# Parse document (primary data and metadata).
#
# Reads "data.xml" below the document path, derives the corpus, document
# and text sigles from its docid attribute, extracts the primary text and
# feeds the corpus, doc and text level "header.xml" files to a metadata
# object of class KorAP::XML::Meta::<meta_type>.
#
# Returns the invocant on success. On failure a warning is logged and
# nothing is returned (undef in scalar context).
sub parse {
  my $self = shift;
  my $meta_data_type = $self->meta_type;

  # Matches the encoding declaration at the start of an XML file;
  # the encoding name is captured in $2
  state $ENC_RE = qr/^[^>]+encoding\s*=\s*(["'])([^\1]+?)\1/o;

  # Path to primary
  my $data_xml = $self->path . 'data.xml';
  my ($rt, $error, $file);

  my $unable = 'Unable to parse document ' . $self->path;

  # No primary data found
  if (!-e $data_xml) {
    $log->warn($unable . ' - no data.xml found');
    $error = 1;
  }
  else {
    # Load file
    $file = b(Mojo::File->new($data_xml)->slurp);

    try {
      # Any warning raised while parsing marks the document as broken
      local $SIG{__WARN__} = sub {
        $error = 1;
      };

      $rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
    }
    catch {
      $log->warn($unable);
      $error = 1;
    };
  }

  return if $error;

  $log->debug('Parse document ' . $self->path);

  # Get document id and corpus id
  unless ($rt && $rt->{'-docid'}) {
    $log->warn($unable . ': No raw_text found or no ID');
    return;
  }

  if (my ($corpus, $doc, $text) =
        $rt->{'-docid'} =~ /^([^_]+)_([^\._]+?)\.(.+?)$/) {
    $self->text_sigle(join('/', $corpus, $doc, $text));
    $self->doc_sigle(join('/', $corpus, $doc));
    $self->corpus_sigle($corpus);
  }
  else {
    $log->warn($unable . ': ID not parseable: ' . $rt->{'-docid'});
    return;
  }

  # Get primary data (was my "$pd = $rt->{text};" before)
  # Unfortunately xml2hash removes spaces at the start and at
  # the end of a text node, making it impossible to deal with cmc data.
  #
  # Only trust $2 if the declaration actually matched: after a failed
  # match the capture variables still hold the values of an earlier match.
  my $encoding = ($file =~ $ENC_RE) ? $2 : 'UTF-8';
  $file = $file->decode($encoding);

  my $start = index($file, '<text>');
  my $end   = index($file, '</text>');

  # Without both tags (in order) the offsets below would be meaningless
  if ($start < 0 || $end < $start) {
    $log->warn($unable . ': No primary data found');
    return;
  }

  # Skip the opening tag itself
  $start += 6;
  my $pd = html_unescape(substr($file, $start, $end - $start));

  unless ($pd) {
    $log->warn($unable . ': No primary data found');
    return;
  }

  # Associate primary data
  $self->{pd} = KorAP::XML::Document::Primary->new($pd);

  my @path = grep { $_ } splitdir($self->path);
  my @header;

  # Collect the header files of the text directory and its two parent
  # directories; unshift orders them as corpus, doc, text
  foreach (0..2) {
    # Removed starting '/'
    my $header = ($^O =~ /^mswin/i ? '' : '/');
    $header .= catfile(@path, 'header.xml');
    unshift @header, $header;
    pop @path;
  }

  # Get metadata class and create an object
  my $meta_class = 'KorAP::XML::Meta::' . $meta_data_type;
  my $meta;

  # Load the class on demand. The name is checked before it is turned
  # into a file name, so no arbitrary string is ever evaluated as code.
  my $available = $meta_class->can('new');
  if (!$available && $meta_class =~ /\A\w+(?:::\w+)*\z/) {
    (my $module = $meta_class . '.pm') =~ s{::}{/}g;
    $available = eval { require $module; 1 };
  }

  if ($available) {
    $meta = $meta_class->new(
      log          => $log,
      corpus_sigle => $self->corpus_sigle,
      doc_sigle    => $self->doc_sigle,
      text_sigle   => $self->text_sigle,
      cache        => $self->cache,
      lang         => $self->lang
    );

    # Associate meta object
    $self->{meta} = $meta;
  }

  # Without a metadata object the header files cannot be processed
  unless ($meta) {
    $log->warn(
      "Metadata object for $meta_data_type not initializable"
    );
    return;
  }

  my @type = qw/corpus doc text/;
  foreach my $header_file (@header) {
    # Get corpus, doc and text meta data
    my $type = shift(@type);

    # Check for cache
    next if $meta->is_cached($type);

    next unless -e $header_file;

    # Slurp data and decode with the declared encoding, if any
    my $slurp = b(Mojo::File->new($header_file)->slurp);
    my $header_encoding = ($slurp =~ $ENC_RE) ? $2 : 'UTF-8';
    my $decoded = $slurp->decode($header_encoding);

    # Get DOM
    my $dom = Mojo::DOM->new($decoded);

    # Parse object based on DOM
    $meta->parse($dom, $type);
    $meta->do_cache($type);
  }

  return $self;
}
| 183 | |
| 184 | |
# Start token parsing.
# Creates a KorAP::XML::Tokenizer for the given foundry and layer
# ('OpenNLP' and 'Tokens' unless given) and keeps it in the object if its
# parse succeeds; otherwise a warning is logged.
# Returns the invocant in both cases.
sub tokenize {
  my ($self, $token_foundry, $token_layer) = @_;

  $token_foundry = 'OpenNLP' unless defined $token_foundry;
  $token_layer   = 'Tokens'  unless defined $token_layer;

  # Create tokenizer
  my $tokens = KorAP::XML::Tokenizer->new(
    path    => $self->path,
    doc     => $self,
    foundry => $token_foundry,
    layer   => $token_layer,
    name    => 'tokens'
  );

  # Parse tokens
  if ($tokens->parse) {
    # NOTE(review): this weakens only the lexical $self of this sub; the
    # reference passed as "doc" above is a separate copy. Confirm whether
    # the tokenizer weakens its own side of the doc <-> tokenizer cycle.
    weaken $self;
    $self->{tokenizer} = $tokens;
    return $self;
  }

  $log->warn(
    'Unable to tokenize ' . $self->path
      . ' with ' . $token_foundry . '#' . $token_layer
  );

  return $self;
}
| 217 | |
| 218 | |
# Add annotation.
# Passes all arguments on to the add method of the tokenizer created by
# tokenize(); logs a warning if there is no tokenizer yet.
# Returns the invocant.
sub annotate {
  my $self = shift;
  my $tokenizer = $self->{tokenizer};

  if ($tokenizer) {
    $tokenizer->add(@_);
  }
  else {
    $log->warn('No tokenizer defined');
  }

  return $self;
}
| 231 | |
| 232 | |
# Store arbitrary data.
#   store()             returns the whole storage hash (undef if unused)
#   store($key)         returns the value stored for $key
#   store($key, $value) stores $value for $key and returns it
sub store {
  my ($self, @args) = @_;

  if (!@args) {
    return $self->{store};
  }

  my $key = $args[0];

  if (@args == 1) {
    return $self->{store}->{$key};
  }

  return $self->{store}->{$key} = $args[1];
}
| 240 | |
| 241 | |
# Primary data.
# Returns the object stored under "pd" by parse(), or undef if there
# is none.
sub primary {
  my ($self) = @_;
  return $self->{pd};
}
| 246 | |
| Akron | 47426f0 | 2020-08-06 13:28:53 +0200 | [diff] [blame] | 247 | |
# Get meta object.
# Returns the metadata object stored under "meta" by parse(), or undef
# if there is none.
sub meta {
  my $self = shift;
  return $self->{meta};
}
| 252 | |
| Akron | 47426f0 | 2020-08-06 13:28:53 +0200 | [diff] [blame] | 253 | |
# Serialize to hash.
#
# Parses the document first if no text sigle is set yet. Every key
# reported by the metadata object is renamed with _k(); reference values
# are replaced by the result of the metadata object's keywords method,
# plain values get their whitespace normalized. The corpus, doc and text
# sigles are added as "corpusSigle", "docSigle" and "textSigle".
#
# Returns a hash reference, or nothing (with a logged warning) if no
# metadata object is available, e.g. because parsing failed.
sub to_hash {
  my $self = shift;

  $self->parse unless $self->text_sigle;

  # Get meta object
  my $meta = $self->meta;

  # A failed parse leaves no meta object to serialize
  unless ($meta) {
    $log->warn('No metadata defined');
    return;
  }

  my %hash;

  foreach my $key ($meta->keys) {
    my $v = $meta->{$key};

    if (ref $v) {
      $hash{_k($key)} = $meta->keywords($key);
    }
    else {
      # Turn newlines into spaces and collapse whitespace runs
      $v =~ tr/\n/ /;
      $v =~ s/\s\s+/ /g;
      $hash{_k($key)} = $v;
    }
  }

  foreach my $level (qw/corpus doc text/) {
    $hash{$level . 'Sigle'} = $self->{$level . '_sigle'};
  }

  return \%hash;
}
| 283 | |
| Nils Diewald | 840c924 | 2014-10-28 19:51:26 +0000 | [diff] [blame] | 284 | |
# Turn an internal metadata key into its serialized name: drop the first
# two characters, replace each "_x" by an upper case "X" and write a
# trailing "id" (in any case) as "ID".
sub _k {
  my ($key) = @_;
  my $camel = substr($key, 2) =~ s/_(\w)/\U$1\E/gr;
  return $camel =~ s/id$/ID/gir;
}
| 288 | |
| 289 | |
# Serialize to JSON.
# Returns whatever the to_json method of the tokenizer returns; logs a
# warning and returns nothing if tokenize() has not set up a tokenizer.
sub to_json {
  my ($self) = @_;
  my $tokenizer = $self->{tokenizer};

  return $tokenizer->to_json if $tokenizer;

  $log->warn('No tokenizer defined');
  return;
}
| 299 | |
| Akron | c4ec093 | 2020-08-06 09:19:22 +0200 | [diff] [blame] | 300 | # Functions |
| 301 | |
# Derive a file name from a glob pattern: path separators, commas and
# brackets become dashes, wildcards are dropped, dash runs are squeezed
# and a leading dash, a ".zip" extension and a trailing dash are removed.
sub get_file_name_from_glob ($) {
  my ($name) = @_;

  $name =~ tr!\\/},!-!;    # Transform paths
  $name =~ tr/*?//d;       # Remove arbitrary fills
  $name =~ tr/{}[]/-/;     # Remove class and multiple brackets
  $name =~ tr/-//s;        # Remove sequences of binding characters

  # Clean beginning, remove file extension, clean end - in this order,
  # as stripping the extension may expose a trailing dash
  for my $strip (qr/^-/, qr/\.zip$/, qr/-$/) {
    $name =~ s/$strip//;
  }

  return $name;
}
| 313 | |
| Akron | 35db6e3 | 2016-03-17 22:42:22 +0100 | [diff] [blame] | 314 | |
# Get file name based on path information.
#
# Parameters:
#   $base - base path that is removed from the file path
#   $file - path of the file
#
# Returns $file without a leading temp dir fragment and without the first
# occurrence of the base path, with slashes turned into dashes and, if
# there are more than three dash separated parts, without the first one.
#
# The prototype is kept so existing call sites keep parsing the same way.
sub get_file_name ($$) {
  my ($base, $file) = @_;

  # For an existing directory, cut everything after the last slash, so
  # the name of the directory itself stays part of the file name
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  }

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path. It is quoted, so characters like "+", "." or
  # "(" in a path are matched literally instead of being interpreted as
  # regular expression syntax.
  $file =~ s/\/?\Q$base\E//;

  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/; # shorten
  return $file;
}
| 335 | |
| 336 | |
| Akron | 35db6e3 | 2016-03-17 22:42:22 +0100 | [diff] [blame] | 337 | 1; |
| 338 | |
| 339 | |
| 340 | __END__ |
| Nils Diewald | feccbb1 | 2015-06-18 20:06:45 +0000 | [diff] [blame] | 341 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 342 | =pod |
| 343 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 344 | =encoding utf8 |
| 345 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 346 | =head1 NAME |
| 347 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 348 | KorAP::XML::Krill - Preprocess KorAP XML documents for Krill |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 349 | |
| 350 | |
| 351 | =head1 SYNOPSIS |
| 352 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 353 | # Create Converter Object |
| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 354 | my $doc = KorAP::XML::Krill->new( |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 355 | path => 'mydoc-1/' |
| 356 | ); |
| 357 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 358 | # Convert to krill json |
| 359 | print $doc->parse->tokenize->annotate('Mate', 'Morpho')->to_json; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 360 | |
| 361 | |
| 362 | =head1 DESCRIPTION |
| 363 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 364 | Parse the primary and meta data of a KorAP-XML document. |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 365 | |
| 366 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 367 | =head1 ATTRIBUTES |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 368 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 369 | =head2 log |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 370 | |
| Akron | b9c3381 | 2020-10-21 16:19:35 +0200 | [diff] [blame] | 371 | L<Log::Any> object for logging. |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 372 | |
| 373 | =head2 path |
| 374 | |
| 375 | $doc->path("example-004/"); |
| 376 | print $doc->path; |
| 377 | |
| 378 | The path of the document. |
| 379 | |
| 380 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 381 | =head2 primary |
| 382 | |
| 383 | print $doc->primary->data(0,20); |
| 384 | |
| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 385 | The L<KorAP::XML::Document::Primary> object containing the primary data. |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 386 | |
| 387 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 388 | =head1 METHODS |
| 389 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 390 | =head2 annotate |
| 391 | |
| Akron | a5920b1 | 2016-06-29 18:51:21 +0200 | [diff] [blame] | 392 | $doc->annotate('Mate', 'Morpho'); |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 393 | |
| 394 | Add annotation layer to conversion process. |
| 395 | |
| 396 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 397 | =head2 parse |
| 398 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 399 | $doc = $doc->parse; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 400 | |
Parse the primary data and the metadata of the document.
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 402 | |
| 403 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 404 | =head2 tokenize |
| 405 | |
| 406 | $doc = $doc->tokenize('OpenNLP', 'Tokens'); |
| 407 | |
| 408 | Accept the tokenization based on a given foundry and a given layer. |
| 409 | |
| 410 | |
| 411 | =head1 AVAILABILITY |
| 412 | |
| 413 | https://github.com/KorAP/KorAP-XML-Krill |
| 414 | |
| 415 | |
| 416 | =head1 COPYRIGHT AND LICENSE |
| 417 | |
| Akron | 6882d7d | 2021-02-08 09:43:57 +0100 | [diff] [blame] | 418 | Copyright (C) 2015-2021, L<IDS Mannheim|https://www.ids-mannheim.de/> |
| 419 | Author: L<Nils Diewald|https://www.nils-diewald.de/> |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 420 | |
| 421 | KorAP::XML::Krill is developed as part of the |
| 422 | L<KorAP|http://korap.ids-mannheim.de/> |
| 423 | Corpus Analysis Platform at the |
| Akron | d4c5c10 | 2020-02-11 11:47:59 +0100 | [diff] [blame] | 424 | L<Institute for the German Language (IDS)|https://www.ids-mannheim.de/>, |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 425 | member of the |
| Akron | d4c5c10 | 2020-02-11 11:47:59 +0100 | [diff] [blame] | 426 | L<Leibniz-Gemeinschaft|https://www.leibniz-gemeinschaft.de/en/> |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 427 | and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project, |
| 428 | funded by the |
| 429 | L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>. |
| 430 | |
| 431 | KorAP::XML::Krill is free software published under the |
| Akron | 6882d7d | 2021-02-08 09:43:57 +0100 | [diff] [blame] | 432 | L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>. |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 433 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 434 | =cut |