blob: 68e486a81aa653a07c40ec95e1dd8c31861d1063 [file] [log] [blame]
Nils Diewald90a23f22014-10-31 02:16:14 +00001#!/usr/bin/env perl
2# source ~/perl5/perlbrew/etc/bashrc
3# perlbrew switch perl-blead@korap
4use strict;
5use warnings;
Nils Diewalda0e8d722014-11-01 01:18:25 +00006use Mojo::ByteStream 'b';
Nils Diewaldc95607a2014-11-03 21:04:05 +00007use Devel::Cycle;
8use Memory::Stats;
Nils Diewald90a23f22014-10-31 02:16:14 +00009
10use Benchmark qw/:hireswallclock/;
11
# Start timestamp for the whole run. Its only consumer is the (currently
# disabled) timestr/timediff report on the last line of this script.
my $t = Benchmark->new();
13
14use utf8;
15use lib 'lib', '../lib';
16
Nils Diewald90a23f22014-10-31 02:16:14 +000017use File::Basename 'dirname';
18use File::Spec::Functions 'catdir';
19
Nils Diewaldc95607a2014-11-03 21:04:05 +000020# Tokenization
21use KorAP::Tokenizer;
22use KorAP::Document;
23
24# my $stats = Memory::Stats->new;
25
26#$stats->start;
Nils Diewald90a23f22014-10-31 02:16:14 +000027
# Sample document to convert, located relative to this script's directory.
# Alternative sample (disabled):
#   my $path = catdir(dirname(__FILE__), 'GOE/AGA/03828');
# TODO: Test with an absolute path!
my $script_dir = dirname(__FILE__);
my $path       = catdir($script_dir, 'BZK/D59/00089');

# Disabled memory profiling hook (Memory::Stats):
# $stats->checkpoint(sprintf("%20s", "Init"));

# Create the document object for that directory (a trailing slash is
# appended to the path) and parse it.
my $doc = KorAP::Document->new(path => "$path/");
$doc->parse();

# Disabled memory profiling hook (Memory::Stats):
# $stats->checkpoint(sprintf("%20s", "After Parsing"));
Nils Diewald90a23f22014-10-31 02:16:14 +000039
# Foundry and layer that supply the base tokenization.
my $token_base_foundry = 'OpenNLP';
my $token_base_layer   = 'Tokens';

# Constructor arguments are kept in an ordered list so they reach
# KorAP::Tokenizer->new in exactly this order.
my @tokenizer_args = (
    path    => $doc->path,
    doc     => $doc,
    foundry => $token_base_foundry,
    layer   => $token_base_layer,
    name    => 'tokens',
);

# Build the tokenizer for the parsed document and run the tokenization.
my $tokens = KorAP::Tokenizer->new(@tokenizer_args);
$tokens->parse();

# Disabled memory profiling hook (Memory::Stats):
# $stats->checkpoint(sprintf("%20s", "After Tokenization"));
Nils Diewald24b04462014-11-01 00:16:38 +000052
# Annotation layers added to the token stream, as [foundry, layer] pairs.
# The list order is the order in which $tokens->add is called.
# Commented-out entries are layers that are currently disabled.
my @annotation_layers = (
    [ 'Base',       'Sentences' ],
    [ 'Base',       'Paragraphs' ],
    [ 'OpenNLP',    'Sentences' ],
    [ 'OpenNLP',    'Morpho' ],
    [ 'TreeTagger', 'Sentences' ],
    [ 'TreeTagger', 'Morpho' ],
    [ 'CoreNLP',    'Sentences' ],
    [ 'CoreNLP',    'Constituency' ],
    [ 'CoreNLP',    'NamedEntities' ],
    [ 'CoreNLP',    'Morpho' ],
    [ 'Glemm',      'Morpho' ],
    # [ 'Connexor', 'Sentences' ],
    # [ 'Connexor', 'Morpho' ],
    # [ 'Connexor', 'Phrase' ],
    # [ 'Connexor', 'Syntax' ],
    [ 'Mate',       'Morpho' ],
    # [ 'Mate',     'Dependency' ],
    # [ 'XIP',      'Sentences' ],
    # [ 'XIP',      'Morpho' ],
    # [ 'XIP',      'Constituency' ],
    # [ 'XIP',      'Dependency' ],
);

for my $annotation (@annotation_layers) {
    my ($foundry, $layer) = @{$annotation};

    # The return value is not inspected here.
    # NOTE(review): confirm whether a false return signals a failed layer.
    $tokens->add($foundry, $layer);

    # Disabled memory profiling hook (Memory::Stats):
    # $stats->checkpoint(sprintf("%20s", "After $foundry/$layer"));
}

# Disabled memory profiling teardown (Memory::Stats):
# $stats->stop;
# $stats->report;
Nils Diewaldc95607a2014-11-03 21:04:05 +000093# ok($tokens->to_json, 'To json');
Nils Diewald90a23f22014-10-31 02:16:14 +000094
# Write the result of $tokens->to_json to a file in the current working
# directory. Output name for the alternative GOE/AGA/03828 sample
# (disabled): 'AGA-03828.json'.
my $output_file = 'D59-00089.json';
b($tokens->to_json)->spurt($output_file);

# Disabled: elapsed wall-clock time since $t was created.
# timestr(timediff(Benchmark->new, $t));