Zip data.xml before tokens.xml
This makes the generated zip more (binary) compatible with private/Ingestion
and current DeReKo zips, and less likely to trigger errors in old scripts.
Change-Id: Id3fbb94a0decaaa61f9659572c5cfad6520b471e
diff --git a/script/tei2korapxml b/script/tei2korapxml
index eed322d..0e72931 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -335,6 +335,20 @@
#
+ # Encode and escape data
+ my $escaped_data = escape_xml_minimal(encode( "UTF-8", $data ));
+ # note: the index still refers to the 'single character'-versions,
+ # which are counted as 1 (search for '&' in data.xml and see
+ # corresponding indices in $_tokens_file)
+
+ if ($_DEBUG) {
+ $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
+ };
+
+ $zipper->new_stream("$dir/$_data_file")
+ ->print("$data_prfx1$text_id_esc$data_prfx2$escaped_data$data_sfx");
+
+
# ~ tokenization ~
if ( $_GEN_TOK_EXT ){
@@ -363,19 +377,6 @@
$cons_tok->reset;
};
- # Encode and escape data
- $data = escape_xml_minimal(encode( "UTF-8", $data ));
- # note: the index still refers to the 'single character'-versions,
- # which are counted as 1 (search for '&' in data.xml and see
- # corresponding indices in $_tokens_file)
-
- if ($_DEBUG) {
- $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
- };
-
- $zipper->new_stream("$dir/$_data_file")
- ->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
-
# ~ write structures ~
write_structures() if @structures;