#!/usr/bin/env perl
# source ~/perl5/perlbrew/etc/bashrc
# perlbrew switch perl-blead@korap
use strict;
use warnings;
use Mojo::ByteStream 'b';
use Devel::Cycle;
use Memory::Stats;
use Benchmark qw/:hireswallclock/;
my $t = Benchmark->new;
use utf8;
use lib 'lib', '../lib';
use File::Basename 'dirname';
use File::Spec::Functions 'catdir';
# Tokenization
use KorAP::Tokenizer;
use KorAP::Document;
# my $stats = Memory::Stats->new;
#$stats->start;
# Resolve the sample document directory relative to this script's location.
# Alternative sample directory: 'GOE/AGA/03828'.
# TODO: Test with an absolute path!
my $script_dir = dirname(__FILE__);
my $path       = catdir($script_dir, 'BZK/D59/00089');
# $stats->checkpoint(sprintf("%20s", "Init"));

# The document receives its directory with a trailing slash appended.
my $doc = KorAP::Document->new(path => "$path/");
$doc->parse;
# $stats->checkpoint(sprintf("%20s", "After Parsing"));
# Foundry and layer passed to the tokenizer as its base tokenization.
my $token_base_foundry = 'OpenNLP';
my $token_base_layer   = 'Tokens';

# Construct the tokenizer from an ordered argument list, then parse it.
my $tokens = do {
  my @tokenizer_args = (
    path    => $doc->path,
    doc     => $doc,
    foundry => $token_base_foundry,
    layer   => $token_base_layer,
    name    => 'tokens',
  );
  KorAP::Tokenizer->new(@tokenizer_args);
};
$tokens->parse;
# $stats->checkpoint(sprintf("%20s", "After Tokenization"));
# Annotation layers handed to the tokenizer, as [foundry, layer] pairs.
# The list order is the order in which the layers are added.
# Commented-out entries are layers that are currently disabled.
my @annotation_layers = (
  ['Base',       'Sentences'],
  ['Base',       'Paragraphs'],
  ['OpenNLP',    'Sentences'],
  ['OpenNLP',    'Morpho'],
  ['TreeTagger', 'Sentences'],
  ['TreeTagger', 'Morpho'],
  ['CoreNLP',    'Sentences'],
  ['CoreNLP',    'Constituency'],
  ['CoreNLP',    'NamedEntities'],
  ['CoreNLP',    'Morpho'],
  ['Glemm',      'Morpho'],
  # ['Connexor', 'Sentences'],
  # ['Connexor', 'Morpho'],
  # ['Connexor', 'Phrase'],
  # ['Connexor', 'Syntax'],
  ['Mate',       'Morpho'],
  # ['Mate',     'Dependency'],
  # ['XIP',      'Sentences'],
  # ['XIP',      'Morpho'],
  # ['XIP',      'Constituency'],
  # ['XIP',      'Dependency'],
);

for my $layer_spec (@annotation_layers) {
  my ($foundry, $layer) = @$layer_spec;
  $tokens->add($foundry, $layer);
  # $stats->checkpoint(sprintf("%20s", "After $foundry/$layer"));
}
# $stats->stop;
# $stats->report;

# Serialize the tokenizer to JSON and write it to a file with a relative
# name (output name for the GOE/AGA/03828 sample: 'AGA-03828.json').
b($tokens->to_json)->spurt('D59-00089.json');
# To report the elapsed wall-clock time, uncomment the next line:
# print STDERR timestr(timediff(Benchmark->new, $t)), "\n";