blob: 59253e73f08766cd79965cf9c0e8bf886ebf665c [file] [log] [blame]
=pod

=encoding utf8

=head1 NAME

KorAP::XML::Krill - Preprocess KorAP XML documents for Krill


=head1 SYNOPSIS

  # Create Converter Object
  my $doc = KorAP::XML::Krill->new(
    path => 'mydoc-1/'
  );

  # Convert to Krill JSON
  print $doc->parse->tokenize->annotate('Mate', 'Morpho')->to_json;


=head1 DESCRIPTION

Parse the primary data and metadata of a KorAP-XML document.


=head1 ATTRIBUTES

=head2 log

L<Log::Log4perl> object for logging.

=head2 path

  $doc->path("example-004/");
  print $doc->path;

The path of the document.


=head2 primary

  print $doc->primary->data(0,20);

The L<KorAP::XML::Document::Primary> object containing the primary data.


=head1 METHODS

=head2 annotate

  $doc->annotate('Mate', 'Morpho');

Add an annotation layer to the conversion process.


=head2 parse

  $doc = $doc->parse;

Run the metadata parsing process of the document.


=head2 tokenize

  $doc = $doc->tokenize('OpenNLP', 'Tokens');

Accept the tokenization based on a given foundry and a given layer.


=head1 AVAILABILITY

  https://github.com/KorAP/KorAP-XML-Krill


=head1 COPYRIGHT AND LICENSE

Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Author: L<Nils Diewald|http://nils-diewald.de/>

KorAP::XML::Krill is developed as part of the
L<KorAP|http://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>
and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project,
funded by the
L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>.

KorAP::XML::Krill is free software published under the
L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.

=cut