Added links for Wikipedia resources
Change-Id: I9525947e7538990ab94cc19f2642dd54e3a84e77
diff --git a/Changes b/Changes
index 7133cf4..64d6a84 100644
--- a/Changes
+++ b/Changes
@@ -2,6 +2,8 @@
- Support for 'koral:field' array.
- Support for Koral versioning.
- Added tests for english sources.
+ - Added support for external links for
+ Wikipedia resources.
0.36 2019-01-22
- Support for non-word tokens (fixes #5).
diff --git a/lib/KorAP/XML/Meta/Base.pm b/lib/KorAP/XML/Meta/Base.pm
index b4997c4..de9ad44 100644
--- a/lib/KorAP/XML/Meta/Base.pm
+++ b/lib/KorAP/XML/Meta/Base.pm
@@ -230,11 +230,15 @@
};
sub _attachement_field {
+ my $value = $_[1];
+ if (index($value, 'data:') != 0) {
+ $value = 'data:,' . $value;
+ };
return {
'@type' => 'koral:field',
type => 'type:attachement',
key => $_[0],
- value => 'data:,' . $_[1]
+ value => $value
};
};
diff --git a/lib/KorAP/XML/Meta/I5.pm b/lib/KorAP/XML/Meta/I5.pm
index ed55d72..a34c82e 100644
--- a/lib/KorAP/XML/Meta/I5.pm
+++ b/lib/KorAP/XML/Meta/I5.pm
@@ -346,6 +346,11 @@
if (my $ref_text = _squish $temp->all_text) {
$ref_text =~ s!$REF_RE!!;
$self->{A_reference} = $ref_text;
+
+ # In case of Wikipedia texts, take the URL
+ if ($ref_text =~ /URL:(http:.+?):\s+Wikipedia,\s+\d+\s*$/) {
+ $self->{A_externalLink} = 'data:application/x.korap-link;title=Wikipedia,' . $1;
+ };
};
};
diff --git a/t/real/wdd.t b/t/real/wdd.t
index 2327677..2b44500 100644
--- a/t/real/wdd.t
+++ b/t/real/wdd.t
@@ -48,6 +48,8 @@
REF
is($meta->{S_language}, 'de', 'Language');
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org/wiki/Diskussion:Gunter_A._Pilz', 'link');
+
is($meta->{T_corpus_title}, 'Wikipedia', 'Correct Corpus title');
ok(!$meta->{T_corpus_sub_title}, 'Correct Corpus sub title');
ok(!$meta->{T_corpus_author}, 'Correct Corpus author');
@@ -274,6 +276,9 @@
is($doc->doc_sigle, 'WDD15/A79', 'Correct document sigle');
is($doc->corpus_sigle, 'WDD15', 'Correct corpus sigle');
+$meta = $doc->meta;
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org/wiki/Diskussion:Arteria_interossea_communis', 'link');
+
# Get tokenization
$tokens = KorAP::XML::Tokenizer->new(
path => $doc->path,
diff --git a/t/real/wpd.t b/t/real/wpd.t
index 344c636..517f9be 100644
--- a/t/real/wpd.t
+++ b/t/real/wpd.t
@@ -43,6 +43,9 @@
ok(!$meta->{T_corpus_title}, 'Correct Corpus title');
ok(!$meta->{T_corpus_sub_title}, 'Correct Corpus Sub title');
+# This link is broken, but that's due to the data
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org', 'No link');
+
# Tokenization
use_ok('KorAP::XML::Tokenizer');
@@ -96,6 +99,9 @@
is($doc->doc_sigle, 'WPD15/W28', 'Correct document sigle');
is($doc->corpus_sigle, 'WPD15', 'Correct corpus sigle');
+$meta = $doc->meta;
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org/wiki/Wolfgang_Krebs_(Schauspieler)', 'link');
+
# Get tokenization
$tokens = KorAP::XML::Tokenizer->new(
path => $doc->path,
@@ -126,6 +132,9 @@
is($doc->text_sigle, 'WPD15/U43/34816', 'Correct text sigle');
+$meta = $doc->meta;
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org/wiki/Universitätsbibliothek_Augsburg', 'link');
+
# Tokenization
use_ok('KorAP::XML::Tokenizer');
diff --git a/t/real/wpe.t b/t/real/wpe.t
index db626f9..a3413d2 100644
--- a/t/real/wpe.t
+++ b/t/real/wpe.t
@@ -34,6 +34,8 @@
is($meta->{A_reference}, 'Generation X, In: Wikipedia - URL:http://en.wikipedia.org/wiki/Generation_X: Wikipedia, 2015', 'Reference');
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://en.wikipedia.org/wiki/Generation_X', 'link');
+
is($meta->{'S_availability'}, 'CC-BY-SA', 'Availability');
is($meta->{'S_language'}, 'en', 'Language');
@@ -88,5 +90,14 @@
is($stream->[20]->[4], 'tt/l:historian', 'Treetagger');
is($stream->[20]->[5], 'tt/p:NNS', 'Treetagger');
+
+my $koral = decode_json($tokens->to_json(0.4));
+
+my $link = $koral->{fields}->[5];
+is($link->{'@type'}, 'koral:field', 'attachement');
+is($link->{type}, 'type:attachement', 'attachement');
+is($link->{key}, 'externalLink', 'attachement');
+is($link->{value}, 'data:application/x.korap-link;title=Wikipedia,http://en.wikipedia.org/wiki/Generation_X', 'attachement');
+
done_testing;
__END__
diff --git a/t/tokenization.t b/t/tokenization.t
index 33b0911..eecb1d7 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -68,8 +68,8 @@
$json = decode_json $tokens->to_json(0.4);
is($json->{fields}->[0]->{key}, 'corpusSigle');
is($json->{fields}->[0]->{value}, 'WPD');
-is($json->{fields}->[6]->{key}, 'creationDate');
-is($json->{fields}->[6]->{value}, '2005');
+is($json->{fields}->[7]->{key}, 'creationDate');
+is($json->{fields}->[7]->{value}, '2005');
is($json->{data}->{name}, 'tokens');
is($json->{data}->{tokenSource}, 'opennlp#tokens');