| Akron | ed9baf0 | 2019-01-22 17:03:25 +0100 | [diff] [blame] | 1 | #!/usr/bin/env perl | 
 | 2 | use strict; | 
 | 3 | use warnings; | 
 | 4 | use utf8; | 
 | 5 | use Test::More; | 
 | 6 | use JSON::XS; | 
| Akron | 263274c | 2019-02-07 09:48:30 +0100 | [diff] [blame] | 7 | use Data::Dumper; | 
| Akron | ed9baf0 | 2019-01-22 17:03:25 +0100 | [diff] [blame] | 8 |  | 
 | 9 | use File::Basename 'dirname'; | 
 | 10 | use File::Spec::Functions 'catdir'; | 
 | 11 |  | 
 | 12 | sub _t2h { | 
 | 13 |   my $string = shift; | 
 | 14 |   $string =~ s/^\[\(\d+?-\d+?\)(.+?)\]$/$1/; | 
 | 15 |   my %hash = (); | 
 | 16 |   foreach (split(qr!\|!, $string)) { | 
 | 17 |     $hash{$_} = 1; | 
 | 18 |   }; | 
 | 19 |   return \%hash; | 
 | 20 | }; | 
 | 21 |  | 
 | 22 | use_ok('KorAP::XML::Krill'); | 
 | 23 |  | 
 | 24 | my $path = catdir(dirname(__FILE__), 'corpus/WPD/00001'); | 
 | 25 | ok(my $doc = KorAP::XML::Krill->new( path => $path ), 'Load Korap::Document'); | 
 | 26 | like($doc->path, qr!\Q$path\E/$!, 'Path'); | 
 | 27 | ok($doc->parse, 'Parse document'); | 
 | 28 | is($doc->text_sigle, 'WPD/AAA/00001', 'ID'); | 
 | 29 |  | 
 | 30 |  | 
 | 31 | # Get tokens | 
 | 32 | use_ok('KorAP::XML::Tokenizer'); | 
 | 33 |  | 
 | 34 | # Get tokenization | 
 | 35 | ok(my $tokens = KorAP::XML::Tokenizer->new( | 
 | 36 |   path => $doc->path, | 
 | 37 |   doc => $doc, | 
 | 38 |   foundry => 'OpenNLP', | 
 | 39 |   layer => 'Tokens', | 
 | 40 |   name => 'tokens' | 
 | 41 | ), 'New Tokenizer'); | 
 | 42 | ok($tokens->parse, 'Parse'); | 
 | 43 |  | 
 | 44 | like($tokens->stream->pos(12)->to_string, qr/s:Vokal/); | 
 | 45 | like($tokens->stream->pos(13)->to_string, qr/s:Der/); | 
 | 46 |  | 
 | 47 |  | 
 | 48 | # Get tokenization with non word tokens | 
 | 49 | ok($tokens = KorAP::XML::Tokenizer->new( | 
 | 50 |   path => $doc->path, | 
 | 51 |   doc => $doc, | 
 | 52 |   foundry => 'OpenNLP', | 
 | 53 |   layer => 'Tokens', | 
 | 54 |   name => 'tokens', | 
 | 55 |   non_word_tokens => 1 | 
 | 56 | ), 'New Tokenizer'); | 
 | 57 | ok($tokens->parse, 'Parse'); | 
 | 58 |  | 
 | 59 | like($tokens->stream->pos(12)->to_string, qr/s:Vokal/); | 
 | 60 | like($tokens->stream->pos(13)->to_string, qr/s:\./); | 
 | 61 | like($tokens->stream->pos(14)->to_string, qr/s:Der/); | 
 | 62 |  | 
 | 63 |  | 
# Serialization without an argument: metadata is read from top-level keys
# (the test names refer to this as the "old" format).
my $json = decode_json($tokens->to_json);
is($json->{docSigle}, 'WPD/AAA', 'DocSigle old');
is($json->{author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author old');

# Serialization with argument 0.4 (presumably a format version - confirm
# against KorAP::XML::Tokenizer): metadata is read from the "fields" list.
# NOTE(review): the checks below depend on the order of that list.
$json = decode_json($tokens->to_json(0.4));
is($json->{fields}->[0]->{key}, 'corpusSigle', 'Field 0 key');
is($json->{fields}->[0]->{value}, 'WPD', 'Field 0 value');
is($json->{fields}->[6]->{key}, 'creationDate', 'Field 6 key');
is($json->{fields}->[6]->{value}, '2005', 'Field 6 value');

# Token data of the same serialization
is($json->{data}->{name}, 'tokens', 'Data name');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Data token source');
is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>6083<i>1067<b>0', 'Stream entry [0][1]');

done_testing;
 | 79 |  | 
 | 80 | __END__ |