Always percent-encode data URIs

Change-Id: I1f41f6bc15acb54d5e8d695304ce11120c950e06
diff --git a/Changes b/Changes
index a3fc7ea..646b018 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.42 2021-10-11
+0.42 2022-01-11
         - Replaced Log4perl with Log::Any.
         - Ignore level < 0 structures in DeReKo, but support
           them for base annotations.
@@ -6,6 +6,7 @@
         - Add GitHub action for CI.
         - Remove MANIFEST file from repo.
         - Introduce Gingko support.
+        - Fix data URIs to always be percent-encoded.
 
 0.41 2020-08-10
         - Added support for RWK annotations.
diff --git a/lib/KorAP/XML/Meta/Base.pm b/lib/KorAP/XML/Meta/Base.pm
index d0afe93..21e4297 100644
--- a/lib/KorAP/XML/Meta/Base.pm
+++ b/lib/KorAP/XML/Meta/Base.pm
@@ -1,5 +1,6 @@
 package KorAP::XML::Meta::Base;
 # use Mojo::Log;
+use Mojo::Util 'url_escape';
 use Log::Any qw($log);
 use strict;
 use warnings;
@@ -176,6 +177,21 @@
   return \@fields;
 };
 
+# Build a "data:application/x.korap-link" URI for $data. Optional
+# key/value attributes are emitted in sorted key order, separated by
+# ';'; the payload follows after ','. All parts go through url_escape.
+sub korap_data_uri {
+  my ($self, $data, %attributes) = @_;
+
+  # The media type comes first, then one "key=value" part per attribute
+  my @parts = ('data:application/x.korap-link');
+  for my $key (sort CORE::keys %attributes) {
+    push @parts, url_escape($key) . '=' . url_escape($attributes{$key});
+  };
+
+  return join(';', @parts) . ',' . url_escape($data);
+};
+
 sub _k {
   my $x = substr($_[0], 2);
   $x =~ s/_(\w)/\U$1\E/g;
diff --git a/lib/KorAP/XML/Meta/I5.pm b/lib/KorAP/XML/Meta/I5.pm
index a8b31a5..4dd4b9d 100644
--- a/lib/KorAP/XML/Meta/I5.pm
+++ b/lib/KorAP/XML/Meta/I5.pm
@@ -3,7 +3,6 @@
 use Mojo::Util qw/url_escape/;
 
 our $SIGLE_RE = qr/^([^_\/]+)(?:[_\/]([^\._\/]+?)(?:\.(.+?))?)?$/;
-our $KORAP_LINK_PREF = 'data:application/x.korap-link;';
 
 # STRING:
 #   "pubPlace",
@@ -359,7 +358,7 @@
 
         # In case of Wikipedia texts, take the URL
         if ($ref_text =~ /URL:(http:.+?):\s+Wikipedia,\s+\d+\s*$/) {
-          $self->{A_externalLink} = $KORAP_LINK_PREF . 'title=Wikipedia,' . $1;
+          $self->{A_externalLink} = $self->korap_data_uri($1, title => 'Wikipedia');
         };
       };
     };
@@ -380,9 +379,9 @@
     if ($self->{T_title} && !$self->{A_externalLink} && $self->{_corpus_sigle} =~ /^(?:[AD]GD|FOLK)$/) {
       my $transcript = $self->{T_title};
       $transcript =~ s/_DF_\d+$//i;
-      $self->{A_externalLink} = $KORAP_LINK_PREF . 'title=DGD,' .
+      $self->{A_externalLink} = $self->korap_data_uri(
         'https://dgd.ids-mannheim.de/DGD2Web/ExternalAccessServlet?command=displayData&id=' .
-        url_escape($transcript);
+          url_escape($transcript), title => 'DGD');
     }
   };
 
diff --git a/t/meta_artificial.t b/t/meta_artificial.t
index 777d14d..e6c3d10 100644
--- a/t/meta_artificial.t
+++ b/t/meta_artificial.t
@@ -52,6 +52,13 @@
 is($meta->{S_text_type}, 'Zeitung: Tageszeitung', 'No text_type');
 is($meta->{S_text_type_art}, 'Bericht', 'text_type art');
 
+use_ok('KorAP::XML::Meta::I5');
+
+# Attributes are serialized in sorted key order, all parts percent-encoded
+$meta = KorAP::XML::Meta::I5->new;
+is($meta->korap_data_uri('https://www.test.de', title => 'Hallo!', example => ' Das war einfach'),
+   'data:application/x.korap-link;example=%20Das%20war%20einfach;title=Hallo%21,https%3A%2F%2Fwww.test.de', 'Data URI');
+
 done_testing;
 __END__
 
diff --git a/t/real/agd.t b/t/real/agd.t
index c1f8cc5..0bf6f12 100644
--- a/t/real/agd.t
+++ b/t/real/agd.t
@@ -32,8 +32,8 @@
 is($meta->{D_creation_date}, '20181112', 'Title');
 
 is($meta->{A_externalLink}, 'data:application/x.korap-link;title=DGD,'.
-     'https://dgd.ids-mannheim.de/DGD2Web/ExternalAccessServlet?command=displayData'.
-     '&id=FOLK_E_00321_SE_01_T_01', 'External link');
+     'https%3A%2F%2Fdgd.ids-mannheim.de%2FDGD2Web%2FExternalAccessServlet%3F'.
+     'command%3DdisplayData%26id%3DFOLK_E_00321_SE_01_T_01', 'External link');
 
 # Tokenization
 use_ok('KorAP::XML::Tokenizer');
@@ -136,7 +136,9 @@
 $meta = $doc->meta;
 is($meta->{T_title}, 'FOLK_E_00068_SE_01_T_05_DF_01', 'Title');
 
-is($meta->{A_externalLink}, 'data:application/x.korap-link;title=DGD,https://dgd.ids-mannheim.de/DGD2Web/ExternalAccessServlet?command=displayData&id=FOLK_E_00068_SE_01_T_05');
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=DGD,'.
+     'https%3A%2F%2Fdgd.ids-mannheim.de%2FDGD2Web%2FExternalAccessServlet'.
+     '%3Fcommand%3DdisplayData%26id%3DFOLK_E_00068_SE_01_T_05');
 
 # Tokenization
 use_ok('KorAP::XML::Tokenizer');
diff --git a/t/real/gingko.t b/t/real/gingko.t
index 35cb8de..84ae9d7 100644
--- a/t/real/gingko.t
+++ b/t/real/gingko.t
@@ -69,7 +69,7 @@
 ok(!$meta->{T_doc_author}, 'Correct Doc author');
 is($meta->{A_doc_editor}, 'Prof. Dr. Christian Fandrych, Leipzig University', 'Correct Doc editor');
 
-# Ginkgo Metadata
+# Gingko Metadata
 is($meta->{S_gingko_genre_main}, 'wissenschaftlich');
 is($meta->{S_gingko_genre_sub}, 'wissenschaftlich');
 is($meta->{T_gingko_source}, 'ATZ - Automobiltechnische Zeitschrift');
@@ -110,15 +110,15 @@
 
 # Unknown
 unlike($token, qr!gingko/l!, 'data');
-like($token, qr!ginkgo/p:NN!, 'data');
+like($token, qr!gingko/p:NN!, 'data');
 
 $token = join('||', @{$output->{data}->{stream}->[9]});
 
 like($token, qr!i:heutige!, 'data');
-like($token, qr!ginkgo/p:ADJA!, 'data');
+like($token, qr!gingko/p:ADJA!, 'data');
 like($token, qr!gingko/l:heutig!, 'data');
 
-# Check Ginkgo meta in Koral
+# Check Gingko meta in Koral
 my $koral = decode_json($tokens->to_json(0.4));
 
 my $test = 0;
diff --git a/t/real/wdd.t b/t/real/wdd.t
index 779a44e..89cbba1 100644
--- a/t/real/wdd.t
+++ b/t/real/wdd.t
@@ -52,7 +52,7 @@
 REF
 is($meta->{S_language}, 'de', 'Language');
 
-is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org/wiki/Diskussion:Gunter_A._Pilz', 'link');
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http%3A%2F%2Fde.wikipedia.org%2Fwiki%2FDiskussion%3AGunter_A._Pilz', 'link');
 
 is($meta->{T_corpus_title}, 'Wikipedia', 'Correct Corpus title');
 ok(!$meta->{T_corpus_sub_title}, 'Correct Corpus sub title');
@@ -281,7 +281,7 @@
 is($doc->corpus_sigle, 'WDD15', 'Correct corpus sigle');
 
 $meta = $doc->meta;
-is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org/wiki/Diskussion:Arteria_interossea_communis', 'link');
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http%3A%2F%2Fde.wikipedia.org%2Fwiki%2FDiskussion%3AArteria_interossea_communis', 'link');
 
 # Get tokenization
 $tokens = KorAP::XML::Tokenizer->new(
diff --git a/t/real/wpd.t b/t/real/wpd.t
index ad14631..8ae04a6 100644
--- a/t/real/wpd.t
+++ b/t/real/wpd.t
@@ -48,7 +48,7 @@
 ok(!$meta->{T_corpus_sub_title}, 'Correct Corpus Sub title');
 
 # This link is broken, but that's due to the data
-is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org', 'No link');
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http%3A%2F%2Fde.wikipedia.org', 'No link');
 
 # Tokenization
 use_ok('KorAP::XML::Tokenizer');
@@ -111,7 +111,7 @@
 is($doc->corpus_sigle, 'WPD15', 'Correct corpus sigle');
 
 $meta = $doc->meta;
-is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org/wiki/Wolfgang_Krebs_(Schauspieler)', 'link');
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http%3A%2F%2Fde.wikipedia.org%2Fwiki%2FWolfgang_Krebs_%28Schauspieler%29', 'link');
 
 # Get tokenization
 $tokens = KorAP::XML::Tokenizer->new(
@@ -144,7 +144,7 @@
 is($doc->text_sigle, 'WPD15/U43/34816', 'Correct text sigle');
 
 $meta = $doc->meta;
-is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://de.wikipedia.org/wiki/Universitätsbibliothek_Augsburg', 'link');
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http%3A%2F%2Fde.wikipedia.org%2Fwiki%2FUniversit%E4tsbibliothek_Augsburg', 'link');
 
 # Tokenization
 use_ok('KorAP::XML::Tokenizer');
diff --git a/t/real/wpe.t b/t/real/wpe.t
index 33a4236..08be909 100644
--- a/t/real/wpe.t
+++ b/t/real/wpe.t
@@ -38,7 +38,7 @@
 
 is($meta->{A_reference}, 'Generation X, In: Wikipedia - URL:http://en.wikipedia.org/wiki/Generation_X: Wikipedia, 2015', 'Reference');
 
-is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://en.wikipedia.org/wiki/Generation_X', 'link');
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http%3A%2F%2Fen.wikipedia.org%2Fwiki%2FGeneration_X', 'link');
 
 is($meta->{'S_availability'}, 'CC-BY-SA', 'Availability');
 is($meta->{'S_language'}, 'en', 'Language');
@@ -102,7 +102,7 @@
 is($link->{'@type'}, 'koral:field', 'attachement');
 is($link->{type}, 'type:attachement', 'attachement');
 is($link->{key}, 'externalLink', 'attachement');
-is($link->{value}, 'data:application/x.korap-link;title=Wikipedia,http://en.wikipedia.org/wiki/Generation_X', 'attachement');
+is($link->{value}, 'data:application/x.korap-link;title=Wikipedia,http%3A%2F%2Fen.wikipedia.org%2Fwiki%2FGeneration_X', 'attachement');
 
 done_testing;
 __END__
diff --git a/t/real/wpf.t b/t/real/wpf.t
index e44e0ba..a7b07e5 100644
--- a/t/real/wpf.t
+++ b/t/real/wpf.t
@@ -47,7 +47,7 @@
 ok(!$meta->{T_corpus_sub_title}, 'Correct Corpus Sub title');
 
 # This link is broken, but that's due to the data
-is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http://fr.wikipedia.org/wiki/Psychanalyse', 'No link');
+is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http%3A%2F%2Ffr.wikipedia.org%2Fwiki%2FPsychanalyse', 'No link');
 
 # Tokenization
 use_ok('KorAP::XML::Tokenizer');