Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 1 | =pod |
2 | |||||
3 | =encoding utf8 | ||||
4 | |||||
5 | =head1 NAME | ||||
6 | |||||
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 7 | KorAP::XML::Krill - Preprocess KorAP XML documents for Krill |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 8 | |
9 | |||||
10 | =head1 SYNOPSIS | ||||
11 | |||||
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 12 | # Create Converter Object |
13 | my $doc = KorAP::XML::Krill->new( | ||||
14 | path => 'mydoc-1/' | ||||
15 | ); | ||||
16 | |||||
17 | # Convert to Krill JSON | ||||
18 | print $doc->parse->tokenize->annotate('Mate', 'Morpho')->to_json; | ||||
19 | |||||
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 20 | |
21 | =head1 DESCRIPTION | ||||
22 | |||||
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 23 | Parse the primary data and metadata of a KorAP-XML document. |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 24 | |
25 | |||||
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 26 | =head1 ATTRIBUTES |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 27 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 28 | =head2 log |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 29 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 30 | L<Log::Log4perl> object for logging. |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 31 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 32 | =head2 path |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 33 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 34 | $doc->path("example-004/"); |
35 | print $doc->path; | ||||
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 36 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 37 | The path of the document. |
Akron | a76d835 | 2016-10-27 16:27:32 +0200 | [diff] [blame] | 38 | |
Akron | 7606afa | 2016-10-25 16:23:49 +0200 | [diff] [blame] | 39 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 40 | =head2 primary |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 41 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 42 | print $doc->primary->data(0,20); |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 43 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 44 | The L<KorAP::XML::Document::Primary> object containing the primary data. |
Akron | a76d835 | 2016-10-27 16:27:32 +0200 | [diff] [blame] | 45 | |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 46 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 47 | =head1 METHODS |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 48 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 49 | =head2 annotate |
Akron | a76d835 | 2016-10-27 16:27:32 +0200 | [diff] [blame] | 50 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 51 | $doc->annotate('Mate', 'Morpho'); |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 52 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 53 | Add an annotation layer to the conversion process. |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 54 | |
55 | |||||
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 56 | =head2 parse |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 57 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 58 | $doc = $doc->parse; |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 59 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 60 | Run the meta parsing process of the document. |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 61 | |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 62 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 63 | =head2 tokenize |
Akron | 7606afa | 2016-10-25 16:23:49 +0200 | [diff] [blame] | 64 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 65 | $doc = $doc->tokenize('OpenNLP', 'Tokens'); |
Akron | a5920b1 | 2016-06-29 18:51:21 +0200 | [diff] [blame] | 66 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 67 | Accept the tokenization based on a given foundry and a given layer. |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 68 | |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 69 | |
70 | =head1 AVAILABILITY | ||||
71 | |||||
72 | https://github.com/KorAP/KorAP-XML-Krill | ||||
73 | |||||
74 | |||||
75 | =head1 COPYRIGHT AND LICENSE | ||||
76 | |||||
77 | Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/> | ||||
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 78 | Author: L<Nils Diewald|http://nils-diewald.de/> |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 79 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 80 | KorAP::XML::Krill is developed as part of the |
81 | L<KorAP|http://korap.ids-mannheim.de/> | ||||
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 82 | Corpus Analysis Platform at the |
83 | L<Institute for the German Language (IDS)|http://ids-mannheim.de/>, | ||||
84 | member of the | ||||
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 85 | L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/> |
86 | and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project, | ||||
87 | funded by the | ||||
88 | L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>. | ||||
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 89 | |
Akron | 2fd402b | 2016-10-27 21:26:48 +0200 | [diff] [blame] | 90 | KorAP::XML::Krill is free software published under the |
Akron | c13a170 | 2016-03-15 19:33:14 +0100 | [diff] [blame] | 91 | L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>. |
92 | |||||
93 | =cut |