Added external links for Wikipedia resources

Change-Id: I9525947e7538990ab94cc19f2642dd54e3a84e77
diff --git a/Changes b/Changes
index 7133cf4..64d6a84 100644
--- a/Changes
+++ b/Changes
@@ -2,6 +2,8 @@
         - Support for 'koral:field' array.
         - Support for Koral versioning.
         - Added tests for english sources.
+        - Added support for external links for
+          Wikipedia resources.
 
 0.36 2019-01-22
         - Support for non-word tokens (fixes #5).
diff --git a/lib/KorAP/XML/Meta/Base.pm b/lib/KorAP/XML/Meta/Base.pm
index b4997c4..de9ad44 100644
--- a/lib/KorAP/XML/Meta/Base.pm
+++ b/lib/KorAP/XML/Meta/Base.pm
@@ -230,11 +230,20 @@
 };
 
+# Create a 'koral:field' hash of type 'type:attachement'.
+# Expects the field key and the field value as arguments.
+# Values that already start with 'data:' are passed through
+# unchanged, all other values are prefixed with 'data:,'.
 sub _attachement_field {
+  my ($key, $value) = @_;
+  # Only wrap values that do not start with 'data:' yet
+  if (index($value, 'data:') != 0) {
+    $value = 'data:,' . $value;
+  };
   return {
     '@type' => 'koral:field',
     type    => 'type:attachement',
-    key     => $_[0],
-    value   => 'data:,' . $_[1]
+    key     => $key,
+    value   => $value
   };
 };
 
diff --git a/lib/KorAP/XML/Meta/I5.pm b/lib/KorAP/XML/Meta/I5.pm
index ed55d72..a34c82e 100644
--- a/lib/KorAP/XML/Meta/I5.pm
+++ b/lib/KorAP/XML/Meta/I5.pm
@@ -346,6 +346,11 @@
       if (my $ref_text = _squish $temp->all_text) {
         $ref_text =~ s!$REF_RE!!;
         $self->{A_reference} = $ref_text;
+
+        # Wikipedia references end in "URL:<url>: Wikipedia, <digits>" - expose that URL (http or https) as an external link
+        if ($ref_text =~ /URL:(https?:.+?):\s+Wikipedia,\s+\d+\s*$/) {
+          $self->{A_externalLink} = 'data:application/x.korap-link;title=Wikipedia,' . $1;
+        };
       };
     };
 
diff --git a/t/real/wdd.t b/t/real/wdd.t
index 2327677..2b44500 100644
--- a/t/real/wdd.t
+++ b/t/real/wdd.t
@@ -48,6 +48,8 @@
 REF
 is($meta->{S_language}, 'de', 'Language');
 
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org/wiki/Diskussion:Gunter_A._Pilz', 'link');
+
 is($meta->{T_corpus_title}, 'Wikipedia', 'Correct Corpus title');
 ok(!$meta->{T_corpus_sub_title}, 'Correct Corpus sub title');
 ok(!$meta->{T_corpus_author}, 'Correct Corpus author');
@@ -274,6 +276,9 @@
 is($doc->doc_sigle, 'WDD15/A79', 'Correct document sigle');
 is($doc->corpus_sigle, 'WDD15', 'Correct corpus sigle');
 
+$meta = $doc->meta;
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org/wiki/Diskussion:Arteria_interossea_communis', 'link');
+
 # Get tokenization
 $tokens = KorAP::XML::Tokenizer->new(
   path => $doc->path,
diff --git a/t/real/wpd.t b/t/real/wpd.t
index 344c636..517f9be 100644
--- a/t/real/wpd.t
+++ b/t/real/wpd.t
@@ -43,6 +43,9 @@
 ok(!$meta->{T_corpus_title}, 'Correct Corpus title');
 ok(!$meta->{T_corpus_sub_title}, 'Correct Corpus Sub title');
 
+# The expected URL lacks the article path - that is a flaw in the source data, not in the extraction
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org', 'No link');
+
 # Tokenization
 use_ok('KorAP::XML::Tokenizer');
 
@@ -96,6 +99,9 @@
 is($doc->doc_sigle, 'WPD15/W28', 'Correct document sigle');
 is($doc->corpus_sigle, 'WPD15', 'Correct corpus sigle');
 
+$meta = $doc->meta;
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org/wiki/Wolfgang_Krebs_(Schauspieler)', 'link');
+
 # Get tokenization
 $tokens = KorAP::XML::Tokenizer->new(
   path => $doc->path,
@@ -126,6 +132,9 @@
 
 is($doc->text_sigle, 'WPD15/U43/34816', 'Correct text sigle');
 
+$meta = $doc->meta;
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org/wiki/Universitätsbibliothek_Augsburg', 'link');
+
 # Tokenization
 use_ok('KorAP::XML::Tokenizer');
 
diff --git a/t/real/wpe.t b/t/real/wpe.t
index db626f9..a3413d2 100644
--- a/t/real/wpe.t
+++ b/t/real/wpe.t
@@ -34,6 +34,8 @@
 
 is($meta->{A_reference}, 'Generation X, In: Wikipedia - URL:http://en.wikipedia.org/wiki/Generation_X: Wikipedia, 2015', 'Reference');
 
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://en.wikipedia.org/wiki/Generation_X', 'link');
+
 is($meta->{'S_availability'}, 'CC-BY-SA', 'Availability');
 is($meta->{'S_language'}, 'en', 'Language');
 
@@ -88,5 +90,14 @@
 is($stream->[20]->[4], 'tt/l:historian', 'Treetagger');
 is($stream->[20]->[5], 'tt/p:NNS', 'Treetagger');
 
+
+my $koral = decode_json($tokens->to_json(0.4));
+# Look up the field by its key, so the test does not depend on the order of the fields
+my ($link) = grep { $_->{key} eq 'externalLink' } @{$koral->{fields}};
+is($link->{'@type'}, 'koral:field', 'attachement');
+is($link->{type}, 'type:attachement', 'attachement');
+is($link->{key}, 'externalLink', 'attachement');
+is($link->{value}, 'data:application/x.korap-link;title=Wikipedia,http://en.wikipedia.org/wiki/Generation_X', 'attachement');
+
 done_testing;
 __END__
diff --git a/t/tokenization.t b/t/tokenization.t
index 33b0911..eecb1d7 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -68,8 +68,8 @@
 $json = decode_json $tokens->to_json(0.4);
 is($json->{fields}->[0]->{key}, 'corpusSigle');
 is($json->{fields}->[0]->{value}, 'WPD');
-is($json->{fields}->[6]->{key}, 'creationDate');
-is($json->{fields}->[6]->{value}, '2005');
+is($json->{fields}->[7]->{key}, 'creationDate');
+is($json->{fields}->[7]->{value}, '2005');
 
 is($json->{data}->{name}, 'tokens');
 is($json->{data}->{tokenSource}, 'opennlp#tokens');