# blob: be90c4a80166982a1ec31207c3591667963ca8b1 [file] [log] [blame]
use strict;
use warnings;
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;
use Test::XML::Loy;
use FindBin;
use utf8;
# Skip the whole test file when the KorAP tokenizer module cannot be
# loaded. The check runs at compile time, before any test is executed.
BEGIN {
  my $tokenizer_loaded = eval {
    require KorAP::XML::TEI::Tokenizer::KorAP;
    1;
  };

  unless ($tokenizer_loaded) {
    plan skip_all => "KorAP::XML::TEI::Tokenizer::KorAP cannot be used";
  }
}
# Both modules exercised by this file must be loadable.
use_ok('KorAP::XML::TEI::Annotations::Collector');
require_ok('KorAP::XML::TEI::Tokenizer::KorAP');

# Tokenizer instance shared by every test case in this file.
# NOTE(review): the meaning of the constructor argument 1 is not visible
# here - presumably it enables sentence splitting, which is exercised via
# sentencize_from_previous_input() further down; confirm against the class.
my $ext = KorAP::XML::TEI::Tokenizer::KorAP->new(1);
# Basic case: three words separated by single spaces.
$ext->tokenize("Der alte Mann");
my $str = $ext->to_string('unknown');
my $t = Test::XML::Loy->new($str);

# Expected offsets, listed as [span position, attribute, value].
for my $expected (
  [1, to   => 3],
  [2, from => 4],
  [2, to   => 8],
  [3, from => 9],
  [3, to   => 13],
) {
  my ($nth, $attr, $offset) = @$expected;
  $t->attr_is("layer spanList span:nth-child($nth)", $attr, $offset);
}
$t->element_count_is('layer spanList span', 3);
# Non-ASCII input: the expected offsets count "ü" and "ß" as one position
# each. Note that reset() is not called before this case, yet the expected
# offsets start from the beginning of the new string again.
$ext->tokenize("ging über die Straße");
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);

# Expected offsets, listed as [span position, attribute, value].
for my $expected (
  [1, to   => 4],
  [2, from => 5],
  [2, to   => 9],
  [3, from => 10],
  [3, to   => 13],
  [4, from => 14],
  [4, to   => 20],
) {
  my ($nth, $attr, $offset) = @$expected;
  $t->attr_is("layer spanList span:nth-child($nth)", $attr, $offset);
}
$t->element_count_is('layer spanList span', 4);
# Input containing a \x{04} character followed by more text: only the two
# words in front of that character are expected to show up as tokens.
$ext->reset;
$ext->tokenize("Hu aha\x{04}\ndas ist cool");
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);

# Expected offsets, listed as [span position, attribute, value].
for my $expected (
  [1, to   => 2],
  [2, from => 3],
  [2, to   => 6],
) {
  my ($nth, $attr, $offset) = @$expected;
  $t->attr_is("layer spanList span:nth-child($nth)", $attr, $offset);
}
$t->element_count_is('layer spanList span', 2);
# Punctuation, quotes and an "@" handle: 14 tokens are expected, the last
# one covering offsets 80-92.
$ext->reset;
my $string = "Pluto.\" Eris-Entdecker Mike Brown, der im Kurznachrichtendienst Twitter unter \"\@plutokiller";
$ext->tokenize($string);
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);
for my $expected ([from => 80], [to => 92]) {
  $t->attr_is('layer spanList span:nth-child(14)', @$expected);
}
$t->element_count_is('layer spanList span', 14);

# Sentence boundaries are derived from the input tokenized just above and
# added to the collector; the last collected structure is inspected.
my $structures = KorAP::XML::TEI::Annotations::Collector->new;
$ext->sentencize_from_previous_input($structures);
$t = Test::XML::Loy->new($structures->[-1]->to_string(3));
$t->attr_is('span', 'from', 6);
$t->attr_is('span', 'to', 92);
$t->attr_is('span', 'l', -1, "sentence splitting with korap tokenizer");
# A domain name is expected to stay one token (third span, offsets 13-29).
$ext->reset;
$string = "Gefunden auf www.wikipedia.de";
$ext->tokenize($string);
$str = $ext->to_string('unknown');
$t = Test::XML::Loy->new($str);
for my $expected ([from => 13], [to => 29]) {
  $t->attr_is('layer spanList span:nth-child(3)', @$expected);
}
$t->element_count_is('layer spanList span', 3);
# Apostrophe handling. Each case pairs an input string with the number of
# tokens expected for it; only the token count is checked.
for my $case (
  ["J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île", 15],
  ["isn't I've we'll you're", 8],
  ["Lu'hafen W'schaft gibt's", 3],
  ["'Luhafen 'Wschaft", 4],
) {
  my ($input, $token_count) = @$case;

  # The file-level variables are reused so later code sees the same state.
  $string = $input;
  $ext->reset;
  $ext->tokenize($string);
  $str = $ext->to_string('unknown');
  $t = Test::XML::Loy->new($str);
  $t->element_count_is('layer spanList span', $token_count);
}
# Tests for issue #115
# Words containing "*" (and "-") are expected to remain single tokens.
$ext->reset;
$string = "Die Serb*innen wie die Kosovo-Albaner*innen";
$ext->tokenize($string);
$str = $ext->to_string('issue-115');
$t = Test::XML::Loy->new($str);
$t->element_count_is('layer spanList span', 5, 'Issue #115 - token count');

# Expected offsets, listed as [span position, attribute, value, word].
for my $expected (
  [2, from => 4,  'Serb*innen'],
  [2, to   => 14, 'Serb*innen'],
  [5, from => 23, 'Kosovo-Albaner*innen'],
  [5, to   => 43, 'Kosovo-Albaner*innen'],
) {
  my ($nth, $attr, $offset, $word) = @$expected;
  $t->attr_is(
    "layer spanList span:nth-child($nth)",
    $attr,
    $offset,
    "Issue #115 - $word $attr"
  );
}
# Tests for issue #114
# The complete emoji placeholder is expected to form exactly one token
# covering offsets 0-18.
$ext->reset;
$string = "[_EMOJI:{{S|;)}}_]";
$ext->tokenize($string);
$str = $ext->to_string('issue-114');
$t = Test::XML::Loy->new($str);
{
  my $first_span = 'layer spanList span:nth-child(1)';
  $t->element_count_is('layer spanList span', 1, 'Issue #114 - token count');
  # The start offset 0 is checked with an attribute selector.
  $t->element_exists($first_span . '[from="0"]', 'Issue #114 - EMOJI from');
  $t->attr_is($first_span, 'to', 18, 'Issue #114 - EMOJI to');
}
# Tests for issue #113
# Emoji sequences made of several code points are expected to form exactly
# one token each. Every case lists: input string, identifier passed to
# to_string(), label used in the test descriptions, expected end offset.
for my $case (
  ["✊🏿", 'issue-113-1', 'emoji modifier', 2],
  # U+1F468 U+200D U+1F468 U+200D U+1F466
  ["👨‍👨‍👦", 'issue-113-2', 'emoji ZWJ family 1', 5],
  # U+1F468 U+200D U+1F466 U+200D U+1F466
  ["👨‍👦‍👦", 'issue-113-3', 'emoji ZWJ family 2', 5],
) {
  my ($input, $id, $label, $end) = @$case;

  # The file-level variables are reused, as in the other test cases.
  $string = $input;
  $ext->reset;
  $ext->tokenize($string);
  $str = $ext->to_string($id);
  $t = Test::XML::Loy->new($str);
  $t->element_count_is('layer spanList span', 1, "Issue #113 - $label count");
  $t->element_exists(
    'layer spanList span:nth-child(1)[from="0"]',
    "Issue #113 - $label from"
  );
  $t->attr_is('layer spanList span:nth-child(1)', 'to', $end, "Issue #113 - $label to");
}

done_testing;