Fixed Windows support
Change-Id: I042ab736bb0fc6e7dce17c330b3bc663be60cc79
diff --git a/.gitignore b/.gitignore
index 2aee81a..827ad4c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@
t/corpus/BRZ13
nytprof.out
nytprof
+\#*#
*.tar.gz
*~
*.sqlite
diff --git a/Changes b/Changes
index 140240b..f3f575a 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.21 2016-10-24
+ - Improved Windows support
+
0.20 2016-10-15
- Fixed treatment of temporary folders in script
diff --git a/lib/KorAP/XML/Annotation/Glemm/Morpho.pm b/lib/KorAP/XML/Annotation/Glemm/Morpho.pm
index f96aee5..c4ccea5 100644
--- a/lib/KorAP/XML/Annotation/Glemm/Morpho.pm
+++ b/lib/KorAP/XML/Annotation/Glemm/Morpho.pm
@@ -1,5 +1,7 @@
package KorAP::XML::Annotation::Glemm::Morpho;
use KorAP::XML::Annotation::Base;
+use strict;
+use warnings;
sub parse {
my $self = shift;
@@ -9,15 +11,16 @@
layer => 'morpho',
cb => sub {
my ($stream, $token) = @_;
+
my $mtt = $stream->pos($token->pos);
- my $content = $token->hash->{fs}->{f} or return;
+ my $content = $token->hash->{'fs'}->{'f'} or return;
# All interpretations
foreach (ref $content eq 'ARRAY' ? @$content : $content) {
# All features
- $content = $_->{fs}->{f};
+ $content = $_->{'fs'}->{'f'};
my $lemma;
my ($composition, $derivation) = (0,0);
diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
index 76a4a46..0f56b3c 100644
--- a/lib/KorAP/XML/Batch/File.pm
+++ b/lib/KorAP/XML/Batch/File.pm
@@ -77,7 +77,7 @@
$file = IO::Compress::Gzip->new($output, TextFlag => 1, Minimal => 1);
}
else {
- $file = IO::File->new($output, "w");
+ $file = IO::File->new($output, "w"); # '>:encoding(UTF-8)'); # "w");
};
# Write to output
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index ba99cec..f94c07b 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -15,7 +15,7 @@
use Data::Dumper;
use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
-our $VERSION = '0.20';
+our $VERSION = '0.21';
has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
@@ -66,7 +66,6 @@
}
else {
-
# Load file
$file = b($data_xml)->slurp;
try {
@@ -109,7 +108,7 @@
$self->log->warn($unable . ': No primary data found');
return;
};
-
+
# Associate primary data
$self->{pd} = KorAP::XML::Document::Primary->new($pd);
@@ -119,7 +118,8 @@
# Parse the corpus file, the doc file,
# and the text file for meta information
foreach (0..2) {
- unshift @header, '/' . catfile(@path, 'header.xml');
+ # No hard-coded leading '/', so the header path is built by catfile alone
+ unshift @header, catfile(@path, 'header.xml');
pop @path;
};
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 5b9ee6f..27fcd24 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -336,7 +336,6 @@
return;
};
-
$self->log->trace(
($param{skip} ? 'Skip' : 'Add').' token data '.$param{foundry}.':'.$param{layer}
);
@@ -397,8 +396,9 @@
my $mod = 'KorAP::XML::Annotation::' . $foundry . '::' . $layer;
if ($mod->can('new') || eval("require $mod; 1;")) {
-
- if (my $retval = $mod->new($self)->parse(@_)) {
+ my $obj = $mod->new($self);
+
+ if (my $retval = $obj->parse(@_)) {
# This layer is supported
$self->support($foundry => $layer, @_);
diff --git a/lib/KorAP/XML/Tokenizer/Tokens.pm b/lib/KorAP/XML/Tokenizer/Tokens.pm
index 50d62fa..49ca7b6 100644
--- a/lib/KorAP/XML/Tokenizer/Tokens.pm
+++ b/lib/KorAP/XML/Tokenizer/Tokens.pm
@@ -3,6 +3,7 @@
use Mojo::ByteStream 'b';
use KorAP::XML::Tokenizer::Token;
use Carp qw/croak carp/;
+use File::Spec::Functions qw/catdir catfile/;
use XML::Fast;
use Try::Tiny;
@@ -13,7 +14,8 @@
sub parse {
my $self = shift;
- my $path = $self->path . $self->foundry . '/' . $self->layer . '.xml';
+ # my $path = $self->path . $self->foundry . '/' . $self->layer . '.xml';
+ my $path = catfile($self->path, $self->foundry, $self->layer . '.xml');
# Legacy data support
unless (-e $path) {
@@ -22,7 +24,7 @@
return unless -e $path;
}
elsif ($self->layer eq 'morpho' && $self->foundry eq 'glemm') {
- $path = $self->path . $self->foundry . '/glemm.xml';
+ $path = catfile($self->path, $self->foundry, 'glemm.xml');
return unless -e $path;
}
else {
@@ -34,8 +36,8 @@
# Bug workaround
if ($self->foundry eq 'glemm') {
- if (index($file, "</span\n") > 0) {
- $file =~ s!</span$!</span>!gm
+ if (index($file, "</span\n") > 0 || index($file, "</span\r") > 0) {
+ $file =~ s!</span[\n\r]!</span>\n!g;
};
};
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 6b7b9df..42292ad 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -65,11 +65,14 @@
# - Added MDParser#Dependency
#
# 2016/10/15
-# - Fixed temporary path issue in script.
+# - Fixed temporary path issue in script
+#
+# 2016/10/24
+# - Improved Windows support
#
# ----------------------------------------------------------
-our $LAST_CHANGE = '2016/10/15';
+our $LAST_CHANGE = '2016/10/24';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
diff --git a/t/annotation/connexor_sentences.t b/t/annotation/connexor_sentences.t
index f03cec0..0c2891c 100644
--- a/t/annotation/connexor_sentences.t
+++ b/t/annotation/connexor_sentences.t
@@ -17,7 +17,7 @@
path => $path . '/'
), 'Load Korap::Document');
-like($doc->path, qr!$path/$!, 'Path');
+like($doc->path, qr!\Q$path\E/$!, 'Path');
ok($doc->parse, 'Parse document');
ok($doc->primary->data, 'Primary data in existence');
diff --git a/t/annotation/mdp_dependency.t b/t/annotation/mdp_dependency.t
index 5b6858c..30a8708 100644
--- a/t/annotation/mdp_dependency.t
+++ b/t/annotation/mdp_dependency.t
@@ -20,16 +20,22 @@
use lib 't/annotation';
use File::Temp qw/tempdir/;
-use_ok('KorAP::XML::Annotation::MDParser::Dependency');
-use_ok('KorAP::XML::Archive');
-use_ok('KorAP::XML::Krill');
-use_ok('KorAP::XML::Tokenizer');
+use KorAP::XML::Archive;
my $name = 'wpd15-single';
my @path = (dirname(__FILE__), '..', 'corpus','archives');
my $file = catfile(@path, $name . '.zip');
-ok(my $archive = KorAP::XML::Archive->new($file), 'Create archive');
+my $archive = KorAP::XML::Archive->new($file);
+
+unless ($archive->test_unzip) {
+ plan skip_all => 'unzip not found';
+};
+
+use_ok('KorAP::XML::Annotation::MDParser::Dependency');
+use_ok('KorAP::XML::Krill');
+use_ok('KorAP::XML::Tokenizer');
+
ok($archive->attach('#' . catfile(@path, $name . '.mdparser.zip')), 'Attach mdparser archive');
diff --git a/t/annotation/meta.t b/t/annotation/meta.t
index f641ccf..d4773d7 100644
--- a/t/annotation/meta.t
+++ b/t/annotation/meta.t
@@ -16,7 +16,7 @@
ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
-like($doc->path, qr!$path/!, 'Path');
+like($doc->path, qr!\Q$path\E/!, 'Path');
# Metdata
is($doc->text_sigle, 'Corpus/Doc/0001', 'ID-text');
diff --git a/t/annotation/primary.t b/t/annotation/primary.t
index 32431d3..1bc23a0 100644
--- a/t/annotation/primary.t
+++ b/t/annotation/primary.t
@@ -15,7 +15,7 @@
ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::XML::Krill');
ok($doc->parse, 'Parse document');
-like($doc->path, qr!$path/!, 'Path');
+like($doc->path, qr!\Q$path\E/!, 'Path');
is($doc->primary->data,
'Zum letzten kulturellen Anlass lädt die Leitung des Schulheimes Hofbergli ein, '.
diff --git a/t/archive.t b/t/archive.t
index 9d2ce68..3d40549 100644
--- a/t/archive.t
+++ b/t/archive.t
@@ -6,7 +6,7 @@
use File::Spec::Functions qw/catfile catdir/;
use File::Temp qw/tempdir/;
-use_ok('KorAP::XML::Archive');
+use KorAP::XML::Archive;
my $file = catfile(dirname(__FILE__), 'corpus','archive.zip');
my $archive = KorAP::XML::Archive->new($file);
diff --git a/t/batch_file.t b/t/batch_file.t
index 947e1ef..50b59a8 100644
--- a/t/batch_file.t
+++ b/t/batch_file.t
@@ -112,7 +112,7 @@
ok($bf->process($path => $output), 'Process file');
ok(-f $output, 'File exists');
ok($file = slurp $output, 'Slurp data');
-like($file, qr/^\{\n\s+"/, 'No pretty printing');
+like($file, qr/^\{[\n\s]+"/, 'No pretty printing');
# Check overwriting
$bf->{overwrite} = 0;
diff --git a/t/meta.t b/t/meta.t
index 78be93c..a775cbd 100644
--- a/t/meta.t
+++ b/t/meta.t
@@ -16,10 +16,10 @@
# WPD/00001
my $path = catdir(dirname(__FILE__), 'corpus/WPD/00001');
ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
-like($doc->path, qr!$path/!, 'Path');
+like($doc->path, qr!\Q$path\E/!, 'Path');
ok($doc = KorAP::XML::Krill->new( path => $path ), 'Load Korap::Document');
-like($doc->path, qr!$path/$!, 'Path');
+like($doc->path, qr!\Q$path\E/$!, 'Path');
ok($doc->parse, 'Parse document');
@@ -28,6 +28,7 @@
my $meta = $doc->meta;
is($meta->{title}, 'A', 'title');
+
ok(!$meta->{sub_title}, 'subTitle');
is($doc->corpus_sigle, 'WPD', 'corpusID');
is($meta->{pub_date}, '20050328', 'pubDate');
@@ -39,6 +40,7 @@
ok(!$meta->{text_class}->[4], 'TextClass');
is($meta->{author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
+
#is($doc->author->[0], 'Ruru', 'author');
#is($doc->author->[1], 'Jens.Ol', 'author');
#is($doc->author->[2], 'Aglarech', 'author');
@@ -286,10 +288,10 @@
# Multipath headers
$path = catdir(dirname(__FILE__), 'corpus/VDI/JAN/00001');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
-like($doc->path, qr!$path/!, 'Path');
+like($doc->path, qr!\Q$path\E/!, 'Path');
ok($doc = KorAP::XML::Krill->new( path => $path ), 'Load Korap::Document');
-like($doc->path, qr!$path/$!, 'Path');
+like($doc->path, qr!\Q$path\E/$!, 'Path');
ok($doc->parse, 'Parse document');
$meta = $doc->meta;
@@ -339,7 +341,7 @@
# WDD
$path = catdir(dirname(__FILE__), 'corpus/WDD/G27/38989');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
-like($doc->path, qr!$path/!, 'Path');
+like($doc->path, qr!\Q$path\E/!, 'Path');
ok($doc->parse, 'Parse document');
$meta = $doc->meta;
diff --git a/t/meta_caching.t b/t/meta_caching.t
index 4ee4cb2..a0787fa 100644
--- a/t/meta_caching.t
+++ b/t/meta_caching.t
@@ -26,7 +26,7 @@
cache => $cache
), 'Get doc');
-like($doc->path, qr!$path/!, 'Path');
+like($doc->path, qr!\Q$path\E/!, 'Path');
ok(!$cache->get('REI'), 'No REI set');
ok(!$cache->get('REI/BNG'), 'No REI/BNG set');
diff --git a/t/multiple_archives.t b/t/multiple_archives.t
index fbe14e3..7865101 100644
--- a/t/multiple_archives.t
+++ b/t/multiple_archives.t
@@ -6,7 +6,7 @@
use File::Spec::Functions qw/catfile catdir/;
use File::Temp qw/tempdir/;
-use_ok('KorAP::XML::Archive');
+use KorAP::XML::Archive;
my $name = 'wpd15-single';
my @path = (dirname(__FILE__), 'corpus','archives');
@@ -15,7 +15,7 @@
my $archive = KorAP::XML::Archive->new($file);
unless ($archive->test_unzip) {
- plan skip_all => 'unzip not found';
+ plan skip_all => 'unzip not found';
};
ok($archive->test, 'Test archive');
diff --git a/t/real/bzk.t b/t/real/bzk.t
index 76d01ca..90a0c58 100644
--- a/t/real/bzk.t
+++ b/t/real/bzk.t
@@ -3,6 +3,7 @@
use Test::More;
use Data::Dumper;
use JSON::XS;
+use Log::Log4perl;
use Benchmark qw/:hireswallclock/;
@@ -11,12 +12,19 @@
use utf8;
use lib 'lib', '../lib';
+#Log::Log4perl->init({
+# 'log4perl.rootLogger' => 'DEBUG, STDERR',
+# 'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
+# 'log4perl.appender.STDERR.layout' => 'PatternLayout',
+# 'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
+#});
+
use File::Basename 'dirname';
use File::Spec::Functions 'catdir';
use_ok('KorAP::XML::Krill');
-my $path = catdir(dirname(__FILE__), '../corpus/BZK/D59/00001');
+my $path = catdir(dirname(__FILE__), '..', 'corpus', 'BZK', 'D59', '00001');
ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
diff --git a/t/script/archive.t b/t/script/archive.t
index 308414a..d797c20 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t
@@ -10,6 +10,7 @@
use Test::More;
use Test::Output qw/:stdout :stderr :functions/;
use Data::Dumper;
+use KorAP::XML::Archive;
use utf8;
my $f = dirname(__FILE__);
@@ -21,6 +22,10 @@
'archive'
);
+unless (KorAP::XML::Archive::test_unzip) {
+ plan skip_all => 'unzip not found';
+};
+
# Test without parameters
stdout_like(
sub {
diff --git a/t/script/extract.t b/t/script/extract.t
index 2ea1e13..8e7cdfd 100644
--- a/t/script/extract.t
+++ b/t/script/extract.t
@@ -10,6 +10,7 @@
use Test::More;
use Test::Output;
use Data::Dumper;
+use KorAP::XML::Archive;
use utf8;
my $f = dirname(__FILE__);
@@ -21,6 +22,10 @@
'extract'
);
+unless (KorAP::XML::Archive::test_unzip) {
+ plan skip_all => 'unzip not found';
+};
+
# Test without parameters
stdout_like(
sub {
diff --git a/t/script/single.t b/t/script/single.t
index 9d8d28f..40b7940 100644
--- a/t/script/single.t
+++ b/t/script/single.t
@@ -72,6 +72,7 @@
while ($gz->read($buffer)) {
$file .= $buffer;
};
+$gz->close;
ok($json = decode_json($file), 'decode json');
@@ -83,8 +84,8 @@
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
# Delete output
-unlink $output;
-ok(!-f $output, 'Output does not exist');
+is(unlink($output), 1, 'Unlink successful');
+ok(!-e $output, 'Output does not exist');
# Use a different token source and skip all annotations,
# except for DeReKo#Structure and Mate#Dependency
diff --git a/t/sgbr/meta.t b/t/sgbr/meta.t
index 91b0d8d..20b5ad8 100644
--- a/t/sgbr/meta.t
+++ b/t/sgbr/meta.t
@@ -17,7 +17,7 @@
ok($doc->parse, 'Parse document');
-like($doc->path, qr!$path/!, 'Path');
+like($doc->path, qr!\Q$path\E/!, 'Path');
# Metdata
is($doc->text_sigle, 'TEST/BSP/1', 'ID-text');
diff --git a/t/sgbr/meta_duden.t b/t/sgbr/meta_duden.t
index a375307..14f9746 100644
--- a/t/sgbr/meta_duden.t
+++ b/t/sgbr/meta_duden.t
@@ -17,7 +17,7 @@
ok($doc->parse, 'Parse document');
-like($doc->path, qr!$path/!, 'Path');
+like($doc->path, qr!\Q$path\E/!, 'Path');
# Metdata
is($doc->text_sigle, 'PRO-DUD/BSP-2013-01/32', 'ID-text');
diff --git a/t/sgbr/meta_ids.t b/t/sgbr/meta_ids.t
index 5c9c628..8a4cc4e 100644
--- a/t/sgbr/meta_ids.t
+++ b/t/sgbr/meta_ids.t
@@ -17,7 +17,7 @@
ok($doc->parse, 'Parse document');
-like($doc->path, qr!$path/!, 'Path');
+like($doc->path, qr!\Q$path\E/!, 'Path');
# Metdata
is($doc->text_sigle, 'CMC-TSK/2014-09/2843', 'ID-text');
@@ -85,7 +85,7 @@
ok($doc->parse, 'Parse document');
-like($doc->path, qr!$path/!, 'Path');
+like($doc->path, qr!\Q$path\E/!, 'Path');
# Metdata
is($doc->text_sigle, 'CMC-TSK/2014-09/3401', 'ID-text');
diff --git a/t/transform.t b/t/transform.t
index 522b267..798e6c2 100644
--- a/t/transform.t
+++ b/t/transform.t
@@ -70,10 +70,10 @@
my $path = catdir(dirname(__FILE__), 'corpus/WPD/00001');
ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
-like($doc->path, qr!$path/$!, 'Path');
+like($doc->path, qr!\Q$path\E/$!, 'Path');
ok($doc = KorAP::XML::Krill->new( path => $path ), 'Load Korap::Document');
-like($doc->path, qr!$path/$!, 'Path');
+like($doc->path, qr!\Q$path\E/$!, 'Path');
ok($doc->parse, 'Parse document');
@@ -106,7 +106,7 @@
), 'New Tokenizer');
ok($tokens->parse, 'Parse');
-like($tokens->path, qr!$path/$!, 'Path');
+like($tokens->path, qr!\Q$path\E/$!, 'Path');
is($tokens->foundry, 'OpenNLP', 'Foundry');
is($tokens->doc->text_sigle, 'WPD/AAA/00001', 'Doc id');
is($tokens->should, 1068, 'Should');