| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 1 | package KorAP::XML::Krill; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 2 | use Mojo::Base -base; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 3 | use Mojo::ByteStream 'b'; |
| Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 4 | use Mojo::Util qw/encode/; |
| Akron | 3ec0a1c | 2017-01-18 14:41:55 +0100 | [diff] [blame] | 5 | use Mojo::File; |
| Akron | 14ca9f0 | 2016-01-29 19:38:18 +0100 | [diff] [blame] | 6 | use Scalar::Util qw/weaken/; |
| Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 7 | use XML::Fast; |
| 8 | use Try::Tiny; |
| Akron | 7d4cdd8 | 2016-08-17 21:39:45 +0200 | [diff] [blame] | 9 | use Carp qw/croak carp/; |
| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 10 | use KorAP::XML::Document::Primary; |
| Akron | 941c1a6 | 2016-02-23 17:41:41 +0100 | [diff] [blame] | 11 | use KorAP::XML::Tokenizer; |
| Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 12 | use Log::Log4perl; |
| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 13 | use KorAP::XML::Log; |
| Akron | 11c8030 | 2016-03-18 19:44:43 +0100 | [diff] [blame] | 14 | use Cache::FastMmap; |
| Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 15 | use Mojo::DOM; |
| 16 | use Data::Dumper; |
| Akron | af670ae | 2016-10-24 20:14:32 +0200 | [diff] [blame] | 17 | use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 18 | |
| Akron | 3887301 | 2017-02-06 20:27:37 +0100 | [diff] [blame] | 19 | our $VERSION = '0.25'; |
| Nils Diewald | 90410c2 | 2014-11-03 21:04:05 +0000 | [diff] [blame] | 20 | |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 21 | has 'path'; |
| Akron | 35db6e3 | 2016-03-17 22:42:22 +0100 | [diff] [blame] | 22 | has [qw/text_sigle doc_sigle corpus_sigle/]; |
| 23 | has 'meta_type' => 'I5'; |
| Akron | 11c8030 | 2016-03-18 19:44:43 +0100 | [diff] [blame] | 24 | has 'cache'; |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 25 | |
# Logger attribute (lazily built default value).
#
# Uses the Log::Log4perl logger of this package if Log4perl was
# initialized, and a KorAP::XML::Log object otherwise.
# Both loggers are kept in 'state' variables, so they are created
# at most once and shared by all objects of this class.
has log => sub {

  # Log4perl is configured - prefer its logger.
  # The explicit return is essential: a 'state' variable declared
  # in this block is not visible outside of it, so without the
  # return the fallback logger below would always be used.
  if (Log::Log4perl->initialized()) {
    state $log4perl = Log::Log4perl->get_logger(__PACKAGE__);
    return $log4perl;
  };

  # Fallback logger
  state $fallback = KorAP::XML::Log->new;
  return $fallback;
};
| 33 | |
# Constructor
#
# Accepts a list of key/value pairs that become the attributes
# of the object. If a 'path' key is passed, its value is turned
# into an absolute path that always ends with a slash.
sub new {
  my ($class, @param) = @_;
  my $self = bless { @param }, $class;

  # Nothing to normalize without a path
  return $self unless exists $self->{path};

  # Make the path absolute and ensure a trailing slash
  my $abs_path = rel2abs($self->{path});
  $abs_path .= '/' unless $abs_path =~ m!\/$!;
  $self->{path} = $abs_path;

  return $self;
};
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 48 | |
| Akron | 35db6e3 | 2016-03-17 22:42:22 +0100 | [diff] [blame] | 49 | |
# Parse document (primary data and metadata)
#
# Reads 'data.xml' below the document path, derives the text,
# document and corpus sigles from the 'docid' attribute of the
# 'raw_text' root element, wraps the primary text in a
# KorAP::XML::Document::Primary object and passes the 'header.xml'
# files found in the document directory and its two parent
# directories to a metadata object of the class
# KorAP::XML::Meta::<meta_type>.
#
# Returns the object itself on success. On failure a warning is
# logged (except for warnings raised by the XML parser, which only
# mark the document as broken) and nothing is returned.
sub parse {
  my $self = shift;
  my $meta_data_type = $self->meta_type;

  # Finds the encoding given at the start of a header file;
  # the name of the encoding is captured in $2
  state $ENC_RE = qr/^[^>]+encoding\s*=\s*(["'])([^\1]+?)\1/o;

  # Path to primary
  my $data_xml = $self->path . 'data.xml';
  my ($rt, $error, $file);

  my $unable = 'Unable to parse document ' . $self->path;

  # No primary data found
  unless (-e $data_xml) {
    $self->log->warn($unable . ' - no data.xml found');
    return;
  };

  # Load file
  $file = b(Mojo::File->new($data_xml)->slurp);

  try {

    # Any warning raised while parsing marks the document as broken
    local $SIG{__WARN__} = sub {
      $error = 1;
    };

    $rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
  }
  catch {
    $self->log->warn($unable);
    $error = 1;
  };

  return if $error;

  $self->log->debug('Parse document ' . $self->path);

  # Get document id and corpus id
  unless ($rt && $rt->{'-docid'}) {
    $self->log->warn($unable . ': No raw_text found or no ID');
    return;
  };

  # The ID has the form <corpus>_<doc>.<text>.
  # The captures are copied to lexical variables immediately,
  # so they can't be clobbered by later matches.
  my ($corpus_part, $doc_part, $text_part) =
    $rt->{'-docid'} =~ /^([^_]+)_([^\._]+?)\.(.+?)$/;

  unless (defined $corpus_part) {
    $self->log->warn($unable . ': ID not parseable');
    return;
  };

  $self->text_sigle(join('/', $corpus_part, $doc_part, $text_part));
  $self->doc_sigle(join('/', $corpus_part, $doc_part));
  $self->corpus_sigle($corpus_part);

  # Get primary data
  my $pd = $rt->{text};

  unless ($pd) {
    $self->log->warn($unable . ': No primary data found');
    return;
  };

  # Associate primary data
  $self->{pd} = KorAP::XML::Document::Primary->new($pd);

  my @path = grep { $_ } splitdir($self->path);
  my @header;

  # Collect the header files for meta information, starting in
  # the document directory and walking up two directories.
  # Due to the unshift, the topmost file comes first in the list.
  foreach (0..2) {

    # splitdir() dropped the leading '/', so it is added again
    # (unless running on Windows)
    my $header = ($^O =~ /^mswin/i ? '' : '/');
    $header .= catfile(@path, 'header.xml');
    unshift @header, $header;
    pop @path;
  };

  # Get metadata class and create an object
  my $meta_class = 'KorAP::XML::Meta::' . $meta_data_type;
  my $meta;

  # NOTE(review): the string eval interpolates meta_type into code.
  # This is only safe as long as meta_type never contains
  # untrusted input - confirm against the callers.
  if ($meta_class->can('new') || eval("require $meta_class; 1;")) {
    $meta = $meta_class->new(
      log          => $self->log,
      corpus_sigle => $self->corpus_sigle,
      doc_sigle    => $self->doc_sigle,
      text_sigle   => $self->text_sigle,
      cache        => $self->cache
    );

    # Associate meta object
    $self->{meta} = $meta;
  };

  # Without a metadata object the headers can't be processed.
  # Fail like all other error cases instead of dying on a
  # method call on an undefined value.
  unless ($meta) {
    $self->log->warn(
      "Metadata object for $meta_data_type not initializable"
    );
    return;
  };

  # The types correspond to the order of the header files
  my @type = qw/corpus doc text/;
  foreach my $header_file (@header) {

    # Get corpus, doc and text meta data
    my $type = shift(@type);

    # Check for cache
    next if $meta->is_cached($type);

    next unless -e $header_file;

    # Slurp data and decode it. The capture is only used if the
    # match succeeded - otherwise $2 may still hold the value of
    # an older, unrelated match.
    my $slurp = b(Mojo::File->new($header_file)->slurp);
    my $encoding = ($slurp =~ $ENC_RE) ? $2 : 'UTF-8';
    my $content = $slurp->decode($encoding);

    # Get DOM
    my $dom = Mojo::DOM->new($content);

    # Parse object based on DOM
    $meta->parse($dom, $type);
    $meta->do_cache($type);
  };

  return $self;
};
| 179 | |
| 180 | |
# Set the tokenization of the document
#
# Creates a KorAP::XML::Tokenizer for the given foundry and layer
# (defaulting to 'OpenNLP' and 'Tokens') and stores it in the
# object, in case the token layer can be parsed. Otherwise a
# warning is logged. Always returns the object itself.
sub tokenize {
  my ($self, $foundry, $layer) = @_;

  $foundry //= 'OpenNLP';
  $layer   //= 'Tokens';

  # Create tokenizer
  my $tokenizer = KorAP::XML::Tokenizer->new(
    path    => $self->path,
    doc     => $self,
    foundry => $foundry,
    layer   => $layer,
    name    => 'tokens'
  );

  # Parse tokens
  if ($tokenizer->parse) {

    # NOTE(review): this weakens the local variable only. The
    # tokenizer already received 'doc => $self' above, so the
    # reference cycle between document and tokenizer is only
    # broken if the tokenizer weakens its own copy - TODO confirm.
    weaken $self;
    $self->{tokenizer} = $tokenizer;
    return $self;
  };

  $self->log->warn(
    'Unable to tokenize ' . $self->path .
      ' with ' . $foundry . '#' . $layer
  );

  return $self;
};
| 212 | |
| 213 | |
# Add annotation
#
# Passes all parameters to the add() method of the tokenizer.
# Logs a warning in case no tokenizer was set before.
# Always returns the object itself.
sub annotate {
  my ($self, @annotation) = @_;

  if (my $tokenizer = $self->{tokenizer}) {
    $tokenizer->add(@annotation);
  }
  else {
    $self->log->warn('No tokenizer defined');
  };

  return $self;
};
| 226 | |
| 227 | |
# Store arbitrary data
#
# Without parameters the whole store is returned,
# with one parameter the value stored for that key,
# and with two parameters the value is stored for the key
# (and returned).
sub store {
  my ($self, @param) = @_;

  # Return the complete store
  return $self->{store} unless @param;

  my ($key, $value) = @param;

  # Getter
  return $self->{store}->{$key} if @param == 1;

  # Setter
  return $self->{store}->{$key} = $value;
};
| 235 | |
| 236 | |
# Primary data
#
# Returns the value stored under the 'pd' key, which parse()
# sets to a KorAP::XML::Document::Primary object.
sub primary {
  my ($self) = @_;
  return $self->{pd};
};
| 241 | |
# Metadata
#
# Returns the value stored under the 'meta' key, which parse()
# sets to the metadata object it creates.
sub meta {
  my ($self) = @_;
  return $self->{meta};
};
| 245 | |
# Serialize the metadata to a hash reference
#
# Parses the document first, in case no text sigle is set yet.
# All keys of the metadata object are converted using _k().
# Reference values are replaced by the result of the keywords()
# method of the metadata object, in all other values newlines
# and sequences of whitespace are collapsed to a single space.
# The corpus, doc and text sigles are added as 'corpusSigle',
# 'docSigle' and 'textSigle'.
#
# If no metadata object is available (e.g. because parsing
# failed), only the sigle fields are returned.
sub to_hash {
  my $self = shift;

  $self->parse unless $self->text_sigle;

  my %hash;

  # Get meta object - it may be missing if parsing failed,
  # so don't call methods on it blindly
  if (my $meta = $self->meta) {
    foreach my $key ($meta->keys) {

      my $v = $meta->{$key};
      if (ref $v) {
        $hash{_k($key)} = $meta->keywords($key);
      }
      else {
        $v =~ s/\n/ /g;
        $v =~ s/\s\s+/ /g;
        $hash{_k($key)} = $v;
      };
    };
  };

  foreach (qw/corpus doc text/) {
    $hash{$_ . 'Sigle'} = $self->{$_ . '_sigle'};
  };

  return \%hash;
};
| 274 | |
| Nils Diewald | 840c924 | 2014-10-28 19:51:26 +0000 | [diff] [blame] | 275 | |
# Convert a key for serialization
#
# Removes each underscore while uppercasing the character that
# follows it and uppercases a trailing 'id' (in any case) to 'ID',
# e.g. 'pub_place' becomes 'pubPlace' and 'corpus_id' becomes
# 'corpusID'.
sub _k {
  my ($key) = @_;

  # Work on the copy via $_
  for ($key) {
    s/_(\w)/\U$1\E/g;
    s/id$/ID/gi;
  };

  return $key;
};
| 282 | |
| 283 | |
# Serialize to JSON
#
# Returns the result of the to_json() method of the tokenizer.
# Logs a warning and returns nothing in case no tokenizer
# was set before.
sub to_json {
  my ($self) = @_;

  if (my $tokenizer = $self->{tokenizer}) {
    return $tokenizer->to_json;
  };

  $self->log->warn('No tokenizer defined');
  return;
};
| 293 | |
| 294 | |
| 295 | 1; |
| 296 | |
| 297 | |
| 298 | __END__ |
| Nils Diewald | feccbb1 | 2015-06-18 20:06:45 +0000 | [diff] [blame] | 299 | |
# Serialize the attributes to a string, one 'name = value' pair
# per line, followed by the text class and the keywords lines.
#
# NOTE(review): this sub is located after __END__ and is therefore
# never compiled. @ATTR is not defined in the visible part of
# the file.
sub to_string {
  my $self = shift;

  my $string;

  foreach my $name (@ATTR) {

    # Skip attributes without a true value
    my $att = $self->$name or next;

    # Collapse newlines and sequences of whitespace
    $att =~ s/\n/ /g;
    $att =~ s/\s\s+/ /g;
    $string .= $name . ' = ' . $att . "\n";
  };

  $string .= 'text_class = ' . $self->text_class_string . "\n";
  $string .= 'keywords = ' . $self->keywords_string . "\n";

  return $string;
};
| 318 | |
# Todo: Make this a KoralQuery serializer
#
# NOTE(review): this sub is located after __END__ and is therefore
# never compiled. The hash is built but not returned - the
# return value is the value of the final assignment.
sub to_koral_query {
  my ($self) = @_;

  my %koral = (
    '@context' => 'http://korap.ids-mannheim.de/ns/koral/0.4/context.jsonld'
  );

  # $koral{'text'} = $self->primary->data;
  # my $hash = $self->to_hash;
  return $koral{'@type'} = 'koral:corpus';
};
| Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 328 | |
| Akron | 941c1a6 | 2016-02-23 17:41:41 +0100 | [diff] [blame] | 329 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 330 | 1; |
| 331 | |
| 332 | |
| 333 | __END__ |
| 334 | |
| 335 | =pod |
| 336 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 337 | =encoding utf8 |
| 338 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 339 | =head1 NAME |
| 340 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 341 | KorAP::XML::Krill - Preprocess KorAP XML documents for Krill |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 342 | |
| 343 | |
| 344 | =head1 SYNOPSIS |
| 345 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 346 | # Create Converter Object |
| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 347 | my $doc = KorAP::XML::Krill->new( |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 348 | path => 'mydoc-1/' |
| 349 | ); |
| 350 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 351 | # Convert to krill json |
| 352 | print $doc->parse->tokenize->annotate('Mate', 'Morpho')->to_json; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 353 | |
| 354 | |
| 355 | =head1 DESCRIPTION |
| 356 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 357 | Parse the primary and meta data of a KorAP-XML document. |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 358 | |
| 359 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 360 | =head1 ATTRIBUTES |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 361 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 362 | =head2 log |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 363 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 364 | L<Log::Log4perl> object for logging. |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 365 | |
| 366 | =head2 path |
| 367 | |
| 368 | $doc->path("example-004/"); |
| 369 | print $doc->path; |
| 370 | |
| 371 | The path of the document. |
| 372 | |
| 373 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 374 | =head2 primary |
| 375 | |
| 376 | print $doc->primary->data(0,20); |
| 377 | |
| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 378 | The L<KorAP::XML::Document::Primary> object containing the primary data. |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 379 | |
| 380 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 381 | =head1 METHODS |
| 382 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 383 | =head2 annotate |
| 384 | |
| Akron | a5920b1 | 2016-06-29 18:51:21 +0200 | [diff] [blame] | 385 | $doc->annotate('Mate', 'Morpho'); |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 386 | |
| 387 | Add annotation layer to conversion process. |
| 388 | |
| 389 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 390 | =head2 parse |
| 391 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 392 | $doc = $doc->parse; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 393 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 394 | Run the meta parsing process of the document. |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 395 | |
| 396 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 397 | =head2 tokenize |
| 398 | |
| 399 | $doc = $doc->tokenize('OpenNLP', 'Tokens'); |
| 400 | |
| 401 | Accept the tokenization based on a given foundry and a given layer. |
| 402 | |
| 403 | |
| 404 | =head1 AVAILABILITY |
| 405 | |
| 406 | https://github.com/KorAP/KorAP-XML-Krill |
| 407 | |
| 408 | |
| 409 | =head1 COPYRIGHT AND LICENSE |
| 410 | |
| Akron | 3ec0a1c | 2017-01-18 14:41:55 +0100 | [diff] [blame] | 411 | Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/> |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 412 | Author: L<Nils Diewald|http://nils-diewald.de/> |
| 413 | |
| 414 | KorAP::XML::Krill is developed as part of the |
| 415 | L<KorAP|http://korap.ids-mannheim.de/> |
| 416 | Corpus Analysis Platform at the |
| 417 | L<Institute for the German Language (IDS)|http://ids-mannheim.de/>, |
| 418 | member of the |
| 419 | L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/> |
| 420 | and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project, |
| 421 | funded by the |
| 422 | L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>. |
| 423 | |
| 424 | KorAP::XML::Krill is free software published under the |
| 425 | L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>. |
| 426 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 427 | =cut |