| ## #! /appl/soft/bio/bioperl/5.36.0/bin/perl |
| ## #! /usr/bin/perl -w |
| |
| |
| ########################################################################################################################################################### |
| # vrt2tei.pl |
| # eureco |
| # leibniz-institut fuer deutsche sprache / csc finland esbo |
| # august 2024 |
| # |
| # |
| # using XML::Twig, see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig |
| # |
| # usage: see the usage_message function below |
| # Usage: ./vrt2tei.pl [OPTIONS] <vrtxmlfile.xml> > <outfile> |
| # <vrtxmlfile>: xml-ised vrt file |
| # |
| # |
| # TODO: |
| |
| # 0 ZIPPEN mit tei2korapxml; zippen mit korAP indexing |
| |
| # 2 threading on compute node and application on sub corpora of KLK |
| # 2 build 30 billion corpus and index it |
| # 3 Optionen |
| # 3a parametrize deprel for I5 and if Nils is not ready yet |
| |
| # |
| |
| # 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph> |
| # 6 checks and balances, wort reihenfolge nochmal checken? |
| # 7 Encode Kielipankki and National Library of Finland? in teiCorpus Header |
| # 8 How to encode the CLARIN-RES better - more Info from the CMDI |
| # 9 construct <idsDoc>s independent of the order of texts, probably with writing intermediate files to zip |
| # 10 re-implementation of the gawk code in the perl script |
| # 11 Swedish corpus |
| |
| # |
| # |
| ############################################################################################################################################################ |
| |
| use strict; |
| use warnings; |
| #use diagnostics; |
| |
| use Getopt::Std; |
| use XML::Twig; |
| use XML::Generator ':pretty'; # apparently no effect when using flush(); |
| |
| |
| use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht. |
| use POSIX qw(locale_h); # to be able to use setlocale() |
| #setlocale(LC_ALL,'de_DE'); |
| setlocale(LC_ALL, "fi_FI"); |
| use utf8; |
| use open qw( :std :encoding(UTF-8) ); |
| |
| use Time::Piece; |
| use Tie::IxHash; |
| use Data::Random::String; |
| |
| |
| |
| |
| #------------- |
| # get options |
| #------------- |
| |
our ($opt_h, $opt_m, $opt_s, $opt_t);

# Parse the switches. In the getopts() spec a letter followed by ':' takes an
# argument, a letter without ':' is a boolean flag. (-s is accepted by the spec
# but is not evaluated anywhere below.)
# On an unknown switch show the usage text and stop with a failure status.
if (!getopts('hms:t:')) {
    usage_message(1);
    exit 1;
}


#--------------------
# check argument(s)
#--------------------

# option -h: display usage info and exit successfully
# (checked before the argument count so that -h works without an input file)
if ($opt_h) {
    usage_message(0);
    exit 0;
}

# exactly one argument is expected: the vrt-xml input file, e.g. Suomen_Kuvalehti2021.xml
if (@ARGV != 1) {
    usage_message(1);
    exit 1;
}


#------------------------------------------------------------
# initialize defaults for options
#------------------------------------------------------------
my $TEIFORMAT = "tei";
my $MASK      = 0;


#----------------------------------------------------------------------------------------------------------
# interpret the options and check whether their respective argument is meaningful (if applicable)
#----------------------------------------------------------------------------------------------------------

# option -t: output format
if (defined($opt_t)) {
    $TEIFORMAT = $opt_t;
}

# The value must be exactly "tei" or "i5" (case-insensitive). The pattern is
# anchored so that values merely containing one of the words are rejected.
if ($TEIFORMAT !~ /\A(?:tei|i5)\z/i) {
    print STDERR "Error: invalid arg for option -t\n";
    usage_message(1);
    exit 1;
}

# The rest of the script compares against the upper-case forms "TEI" and "I5".
$TEIFORMAT = uc($TEIFORMAT);

# option -m: mask the primary data
if ($opt_m) {
    $MASK = 1;
}
| |
| |
| #----------------------------------------------- |
| # OTHER GLOBAL VARIABLES |
| #----------------------------------------------- |
| |
# Output encoding only; it is handed to the main twig as output_encoding.
my $encoding = "UTF-8";

# DOCTYPE declaration used for I5 output.
my $DTDDECL = 'idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd"';

# Running number of <text> elements seen, and the last month handled by createIdsDoc.
my ($textcounter, $LASTMONTH) = (0, 0);

# Source metadata tables, filled from the sources CSV file and keyed by source title.
our (%corpusids, %srcpublids, %srcfullnames, %srcpubplaces, %srcpublishers, %srctexttypes, %srctextlangs);

# Language code => language name.
our %expandLang;

# One text counter per month ("01" .. "12"), each starting at 1 (by the month, as in DeReKo).
my %doccounter = map { (sprintf("%02d", $_), 1) } 1 .. 12;

# Global values pertaining to the original corpus of *all* newspapers.
my $kielipankkiCorpus  = "klk-fi-v2-vrt";
my $kielipankkiLicense = "CLARIN-RES";
my $CountryKey         = "FI";
| |
| # Table with metadata about the different sources (newspapers) |
# Table with metadata about the different sources (newspapers).
my $sourcescsvfile = "sources_klk_fi_v2_2021_4eureco.csv";

# Corpus-header and text-header skeleton files; the pair depends on the output format.
my ($corpheaderfile, $textheaderfile) =
    $TEIFORMAT eq "I5"
        ? ("i5CorpusHeaderSkeleton.i5.xml",   "i5TextHeaderSkeleton.i5.xml")
        : ("teiCorpusHeaderSkeleton.tei.xml", "teiTextHeaderSkeleton.tei.xml");


# Placeholder; replaced by the XML::Twig object for the input document further below.
my $twig = "";
| |
| # variables $fnsource and $fnyear derived from the filename |
# Derive $fnsource (source name) and $fnyear (four-digit year) from the last path
# component of the input file, which must end in <YYYY>.xml,
# e.g. Suomen_Kuvalehti2021.xml -> "Suomen_Kuvalehti" and "2021".
my $fnsource = (split(/\//, $ARGV[0]))[-1];

# Only use $1 when the substitution really matched: after a failed match $1 still
# holds the capture of an earlier successful match in this scope, which would
# silently yield a bogus year.
my $fnyear;
if ($fnsource =~ s/([0-9][0-9][0-9][0-9])\.xml$//) {
    $fnyear = $1;
}
else {
    die("$0: cannot derive source name and year from input file name '$ARGV[0]' (expected <source><YYYY>.xml)\n");
}

# two-digit year, used in the sigles
my $fnYY = substr($fnyear, 2, 2);
| |
| |
| # months |
# Month number ("01" .. "12") => three-letter sigle component / English month name.
my (%months, %monthnames);
{
    my @numbers = map { sprintf("%02d", $_) } 1 .. 12;

    @months{@numbers}     = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC);
    @monthnames{@numbers} = qw(
        January February March     April   May      June
        July    August   September October November December
    );
}

# Publication type as found in the input (publ_type) => German text type label.
my %mapping = (
    "aikakausi"   => "Zeitschrift",
    "sanomalehti" => "Zeitung",
);
| |
| |
| #------------------------------------------------------------------------------------------- |
| # read source metadata file (prepared manually => ultimately read the info from CMDI File?) |
| # and set variables |
| #------------------------------------------------------------------------------------------- |
| |
# Read the tab-separated source table and fill the metadata hashes.
# Three-argument open keeps the file name from being interpreted as an open mode;
# $! reports why opening failed.
open(my $SOURCES, '<', $sourcescsvfile) || die("$0: cannot open file for reading: $sourcescsvfile: $!");
while (my $fline = <$SOURCES>) {
    chomp($fline);

    # skip comment lines, empty lines and the header line (recognised by "TEICORPUSID")
    next if ($fline =~ /^\#/ || $fline =~ /^\s*$/ || $fline =~ "TEICORPUSID");

    # Columns as used below: [0] -> %corpusids, [1] -> %srcpublids, [2] -> %srcfullnames
    # (full title), [5] -> %srctexttypes, [6] -> %srctextlangs, [7] -> %srcpubplaces,
    # [8] -> %srcpublishers; [3] is the simple title.
    my @flarray = split(/\s*\t+\s*/, $fline);

    # (ToDo: the following hashes could probably be conflated into an array of hashes or so)

    # Every row is registered twice, so that it can be looked up both by its
    # full title ($flarray[2]) and by its simple title ($flarray[3]).
    for my $key ($flarray[2], $flarray[3]) {
        $corpusids{$key}     = $flarray[0];
        $srcpublids{$key}    = $flarray[1];
        $srcfullnames{$key}  = $flarray[2];
        $srcpubplaces{$key}  = $flarray[7];
        $srcpublishers{$key} = $flarray[8];
        $srctexttypes{$key}  = $flarray[5];
        $srctextlangs{$key}  = $flarray[6];
    }
}
close($SOURCES);

# All sigles and header fields are looked up via $fnsource; say so early if the
# table has no row for it instead of only producing "uninitialized" warnings later.
unless (exists $corpusids{$fnsource}) {
    warn("$0: source '$fnsource' (derived from the input file name) has no entry in $sourcescsvfile\n");
}

$expandLang{"fi"} = "Finnish";
$expandLang{"sv"} = "Swedish";
| |
| |
| #------------------------------------------------------------------ |
| # read corpusHeaderSkeleton document and start a twig for it |
| # (since this file need not be streamed, no handlers are needed) |
| #------------------------------------------------------------------ |
| |
# Twig for the corpus header skeleton. Class->new is used instead of the
# indirect-object form "new Class", which Perl can mis-parse.
my $teiCorpusHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);


$teiCorpusHeaderDocTwig->parsefile($corpheaderfile);
my $corpusHeader = $teiCorpusHeaderDocTwig->root;    # the header element for the corpus, i.e. the root of the corpus header skeleton document


#------------------------------------------------------------------
# read textHeaderSkeleton document and start a twig for it
#------------------------------------------------------------------

my $teiTextHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiTextHeaderDocTwig->parsefile($textheaderfile);
my $textHeader = $teiTextHeaderDocTwig->root;        # the header element for a text, i.e. the root of the text header skeleton document
| |
| |
| #--------------------------------------------------------- |
| # define a subtree for idsDoc |
| # for the time being it will only be used for the first |
| # idsDoc header, to be inserted in the root hander |
| #--------------------------------------------------------- |
| |
# Skeleton of the first (January) <idsDoc>. It is only populated for I5 output;
# insertCorpusHeader pastes it after the corpus header. The headers for later
# months are written by createIdsDoc.
my $idsDoc       = XML::Twig::Elt->new('idsDoc');
my $idsDocHeader = XML::Twig::Elt->new('idsHeader');

if ($TEIFORMAT eq "I5") {
    my $docFileDesc        = XML::Twig::Elt->new('fileDesc');
    my $docTitleStmt       = XML::Twig::Elt->new('titleStmt');
    my $dtitle             = XML::Twig::Elt->new('d.title');
    my $docSigle           = XML::Twig::Elt->new('dokumentSigle');

    my $docPublicationStmt = XML::Twig::Elt->new('publicationStmt');
    my $docDistributor     = XML::Twig::Elt->new('distributor');
    my $docPubAddress      = XML::Twig::Elt->new('pubAddress');
    my $docAvailability    = XML::Twig::Elt->new('availability');
    my $docPubDate         = XML::Twig::Elt->new('pubDate');

    my $docSourceDesc      = XML::Twig::Elt->new('sourceDesc');
    my $docBiblStruct      = XML::Twig::Elt->new('biblStruct');
    my $docMonogr          = XML::Twig::Elt->new('monogr');
    my $docHTitle          = XML::Twig::Elt->new('h.title');
    my $docImprint         = XML::Twig::Elt->new('imprint');

    $idsDoc->set_att('version', "1.0");
    $idsDoc->set_att('TEIform', "TEI.2");

    $idsDocHeader->set_att('version', "1.1");
    $idsDocHeader->set_att('type',    "document");
    $idsDocHeader->set_att('pattern', "text");
    $idsDocHeader->set_att('TEIform', "teiHeader");

    # Same availability statement as in the document headers that createIdsDoc
    # emits for the following months, so that all <idsDoc> headers agree.
    $docAvailability->set_att('region', "world");
    $docAvailability->set_text($kielipankkiLicense);

    # idsHeader > fileDesc > titleStmt > (dokumentSigle, d.title)
    $docSigle           ->paste("first_child", $docTitleStmt);
    $dtitle             ->paste("last_child",  $docTitleStmt);
    $docTitleStmt       ->paste("last_child",  $docFileDesc);
    $docFileDesc        ->paste("last_child",  $idsDocHeader);

    # fileDesc > publicationStmt > (distributor, pubAddress, availability, pubDate)
    $docPublicationStmt ->paste("last_child",  $docFileDesc);
    $docDistributor     ->paste("last_child",  $docPublicationStmt);
    $docPubAddress      ->paste("last_child",  $docPublicationStmt);
    $docAvailability    ->paste("last_child",  $docPublicationStmt);
    $docPubDate         ->paste("last_child",  $docPublicationStmt);

    # fileDesc > sourceDesc > biblStruct > monogr > (h.title, imprint)
    $docSourceDesc      ->paste("last_child",  $docFileDesc);
    $docBiblStruct      ->paste("last_child",  $docSourceDesc);
    $docMonogr          ->paste("last_child",  $docBiblStruct);
    $docHTitle          ->paste("last_child",  $docMonogr);
    $docImprint         ->paste("last_child",  $docMonogr);

    $idsDocHeader       ->paste("last_child",  $idsDoc);

    # The first document is always labelled as January of the year taken from the file name.
    $docSigle->set_text($corpusids{$fnsource} . $fnYY . "/JAN");
    $dtitle  ->set_text($srcfullnames{$fnsource} . ", January " . $fnyear);
}
| |
| |
| |
| #---------------------------------- |
| # read the input VRT-XML document |
| #---------------------------------- |
| |
# Fail early with a clear message if the input file cannot be read. The handle
# itself is not needed (parsefile() below opens the file by name), so it is
# closed again right away.
open(my $IN, '<', $ARGV[0]) || die("$0: cannot open file for reading: $ARGV[0]: $!");
close($IN);



#####################
#     M A I N
#####################

#-------------------------------------------------------------------------------------------------------------
# start twig for input and call start tag handler for root and twig handler for each <text> in the VRT
#-------------------------------------------------------------------------------------------------------------


$twig = XML::Twig->new(
    keep_spaces     => 1,    # keeps whitespace at former element boundaries in the output as well
    keep_atts_order => 1,    # requires Tie::IxHash
    comments        => 'drop',

    # called at the start tag of the root element <texts>
    start_tag_handlers => {
        texts => sub { root(@_, $corpusHeader) },
    },

    # called for every completely parsed <text>; each call gets its own copy of the
    # text header skeleton because the header is flushed together with the <text>
    twig_handlers => {
        text => sub { text(@_, $textHeader->copy) },
    },

    output_encoding => $encoding,
);

$twig->parsefile($ARGV[0]);
| |
| |
| ########### |
| # END MAIN |
| ########### |
| |
| |
| |
| |
| ############################## |
| # S U B R O U T I N E S |
| ############################## |
| |
# Start-tag handler for the root element of the input: renames it to the corpus
# element of the chosen output format and inserts the corpus header.
#   $twig         - the twig of the input document
#   $root         - the root element
#   $corpusHeader - header element taken from the corpus header skeleton
sub root {
    my ($twig, $root, $corpusHeader) = @_;

    if ($TEIFORMAT ne "I5") {
        $root->set_gi('teiCorpus');
        $root->set_att("xmlns", 'http://www.tei-c.org/ns/1.0');
    }
    else {
        # the doctype could probably be set anywhere to the twig
        $twig->set_doctype($DTDDECL);
        $root->set_gi('idsCorpus');
        $root->set_att('version', "1.0");
        $root->set_att('TEIform', "teiCorpus.2");
    }

    insertCorpusHeader($root, $corpusHeader);
}
| |
| |
| |
# Fills the corpus header skeleton with the metadata of the current source and
# pastes it as first child of $root. For I5 output the prepared first <idsDoc>
# is pasted right after the header.
sub insertCorpusHeader {
    my ($root, $corpusHeader) = @_;

    # name of the language-code attribute on <language>: "ident" by default, "id" for I5
    my $ident = "ident";

    #-----------------------
    # set corpus header
    #-----------------------

    set_title($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);

    if ($TEIFORMAT eq "TEI") {
        set_sourceDesc($corpusHeader, $srcfullnames{$fnsource}, $fnyear, $kielipankkiCorpus);
    }
    elsif ($TEIFORMAT eq "I5") {
        $ident = "id";

        my $korpusSigle  = $corpusHeader->get_xpath("fileDesc/titleStmt/korpusSigle", 0);
        $korpusSigle->set_text($corpusids{$fnsource} . $fnYY);

        my $pubDate      = $corpusHeader->get_xpath("fileDesc/publicationStmt/pubDate", 0);
        $pubDate->set_text((localtime)[5] + 1900);

        my $transduction = $corpusHeader->get_xpath('encodingDesc/editorialDecl/transduction[@n="2"]', 0);
        $transduction->set_text("I5 version by EuReCo using vrt2tei.pl " . localtime->ymd('-'));

        set_sourceDescI5($corpusHeader);
    }
    else {
        print STDERR "TEIFORMAT is $TEIFORMAT; must be TEI or I5";
    }

    $corpusHeader->paste("first_child", $root);

    # language code and expanded language name of the source
    my $language = $corpusHeader->get_xpath('./profileDesc/langUsage/language', 0);
    $language->set_att($ident, $srctextlangs{$fnsource});
    $language->set_text($expandLang{$srctextlangs{$fnsource}});

    if ($TEIFORMAT eq "I5") {
        $idsDoc->paste("after", $corpusHeader);
    }
}
| |
| |
| #---------------------------- |
| # handler &text for <text> |
| #---------------------------- |
| |
# Twig handler for each <text> of the input: turns it into a <TEI> (or <idsText>)
# element with its own header, wraps the content in <text><body><div>, converts
# <paragraph>/<sentence> to <p>/<s>, builds one <w> per token line and finally
# flushes everything parsed so far to standard output.
#   $twig       - the twig of the input document
#   $text       - the <text> element
#   $textHeader - a private copy of the text header skeleton
sub text {
    my ($twig, $text, $textHeader) = @_;

    $textcounter++;


    # ToDo: catch all other, unexpected children of root

    #--------------------------------------------------------------------------
    # Get text metadata (attributes of <text>) and create teiHeader for <text>
    #--------------------------------------------------------------------------

    my $textattsref = $text->atts();    # reference to the attribute hash, to be used with '->'

    createIdsDoc($textattsref);         # only produces output for the 2nd idsDoc (i.e. february) or higher

    # createTextHeader returns the $textID:
    my $textID = createTextHeader($text, $textattsref, $textHeader);


    #----------------------------------------
    # create <TEI> or <idsText> from <text>
    #----------------------------------------

    # the attributes of the vrt <text> were saved above and can be dropped now
    $text->del_atts;

    if ($TEIFORMAT eq "TEI") {
        $text->set_gi("TEI");
        $text->set_att('xml:id', $textID);
    }
    else {
        $text->set_gi("idsText");
        $text->set_att('version', "1.0");
        # $text->move("last_child", $idsDoc);  # does not work because apparently $idsDoc is not under $root at this point
    }


    #------------------------------------------------------------------
    # create the <tei:text>, <body>, <div> elements inside <TEI>
    #------------------------------------------------------------------

    my $ttext_element = XML::Twig::Elt->new('text');
    my $body_element  = XML::Twig::Elt->new('body');
    my $div_element   = XML::Twig::Elt->new('div');

    # set atts
    $div_element  ->set_att("type", "page");       # ToDo: this is specific to KLK
    $ttext_element->set_att("xml:lang", 'fi');     # as in ICC-NOR

    # paste
    $ttext_element->paste('last_child', $text);
    $body_element ->paste('last_child', $ttext_element);
    $div_element  ->paste('last_child', $body_element);


    #-------------------------------
    # create <p> from <paragraph>
    #-------------------------------

    my @paragraphs = $text->children('paragraph');

    foreach my $paragraph (@paragraphs) {

        setP($paragraph);

        $paragraph->move('last_child', $div_element);

        #------------------------------
        # create <s> from <sentence>
        #------------------------------

        my @sentences = $paragraph->children('sentence');
        foreach my $sentence (@sentences) {

            setS($sentence);

            #--------------------------------------
            # create <w> (word) from each $line
            #--------------------------------------

            # Use the plain (unescaped) text here: the token columns are written back
            # with set_text()/set_att(), which escape '&' and '<' on output. Reading
            # the already escaped xml_text would escape these characters twice.
            my @lines = split(/\n+/, $sentence->text);
            $sentence->set_text("\n");

            for my $line (@lines) {    # Todo: check the order
                if ($line ne "") {
                    my $w_element = XML::Twig::Elt->new('w');
                    createW($w_element, $line);
                    $w_element->paste('last_child', $sentence);
                }
            }    # end words
        }    # end sentences
    }    # end paragraphs

    # Write everything up to and including this <text> to the currently selected
    # output handle (standard output) and release it from memory. flush() takes an
    # optional filehandle, not a file name, so no argument is passed.
    $twig->flush;
}
| |
# Fills the text header skeleton with the metadata of one <text> and pastes it as
# first child of that <text>.
#   $text        - the <text> element
#   $textattsref - reference to the attribute hash of <text>
#   $textHeader  - private copy of the text header skeleton
# Returns the text ID, built as <corpus sigle><YY>_<MMM>.<5-digit per-month counter>.
# Side effect: increments the per-month counter in %doccounter.
sub createTextHeader {
    my ($text, $textattsref, $textHeader) = @_;

    # Attributes of <text> in KLK (example values); the ones marked USE are read below:
    # USE  date="2021-01-15"
    # USE  id="t-bcd0f3fa-bbd3dac4"
    # USE  issue_no="SK0221"
    # USE  label="Suomen Kuvalehti no. SK0221 15.01.2021"
    # USE  language="fi"
    # USE  publ_title="Suomen Kuvalehti"
    # USE  publ_type="aikakausi"
    # USE  sentcount="70"
    # USE  sum_lang="|xxx:44|fin:23|eng:3|"
    # USE  tokencount="304"
    # Not read at present: binding_id, datefrom, dateto, elec_date, file,
    # filename_metadata, filename_orig, img_url, issue_date, issue_title, page_id,
    # page_no, part_name, publ_id, publ_part, timefrom, timeto, version_added.

    my $DATE       = $textattsref->{'date'};
    my $ID         = $textattsref->{'id'};
    my $ISSUENO    = $textattsref->{'issue_no'};
    my $LABEL      = $textattsref->{'label'};
    my $LANGUAGE   = $textattsref->{'language'};
    my $PUBLTITLE  = $textattsref->{'publ_title'};
    my $PUBLTYPE   = $textattsref->{'publ_type'};
    my $SENTCOUNT  = $textattsref->{'sentcount'};
    my $SUMLANG    = $textattsref->{'sum_lang'};
    my $TOKENCOUNT = $textattsref->{'tokencount'};


    #-----------------------------
    # Derived Metadata variables
    #-----------------------------

    my @datearray = split(/-/, $DATE);    # (year, month, day)

    #----------------------------------------------------
    # create textSigle to be returned from this function
    #----------------------------------------------------

    my $yy  = substr($datearray[0], 2, 2);
    my $mm  = $datearray[1];
    my $MMM = $months{$mm};

    my $CSIGLE = $corpusids{$fnsource} . $yy;

    my $textID    = $CSIGLE . "_" . $MMM . "." . sprintf("%05d", $doccounter{$mm}++);
    my $textSigle = $textID;


    #-----------------------------------------------------------------------
    # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
    #-----------------------------------------------------------------------

    $textHeader->paste('first_child', $text);

    # <fileDesc n="EuReCo-[corpus]-[$ID]">
    my $fileDesc = $textHeader->first_child("fileDesc");
    $fileDesc->set_att('n', "EuReCo-" . $kielipankkiCorpus . "-" . $ID);


    #-----------------
    # titleStmt
    #----------------

    my $title     = "title";
    my $titleStmt = $fileDesc->first_child("titleStmt");

    if ($TEIFORMAT eq "I5") {
        $title = "t.title";
        $textSigle =~ s/_/\//g;    # the I5 sigle uses '/' where the text ID has '_'
        $titleStmt->first_child("textSigle")->set_text($textSigle);
    }

    # Case KLK: page_no mostly seems to be "None", so the running text number is used instead
    my $textTitle = $LABEL . ", Text #" . $textcounter;
    $titleStmt->first_child($title)->set_text($textTitle);


    #-----------------------------------------------
    # fileDesc/sourceDesc/biblStruct/analytic
    #-----------------------------------------------

    my $analytic = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/analytic", 0);
    if ($TEIFORMAT eq "I5") { $title = "h.title" }

    $analytic->first_child($title)->set_text($textTitle);
    # The <idno> fields are not filled at present:
    #$analytic->get_xpath('./idno[@type="PAGEID"]', 0)                   ->set_text($textattsref->{'page_id'});
    #$analytic->get_xpath('./idno[@type="BINDINGID"]', 0)                ->set_text($textattsref->{'binding_id'});
    #$analytic->get_xpath('./idno[@type="ID"]', 0)                       ->set_text($ID);
    #$analytic->get_xpath('./idno[@type="KIELIPANKKI_METAFILENAME"]', 0) ->set_text($textattsref->{'filename_metadata'});
    #$analytic->get_xpath('./idno[@type="KIELIPANKKI_ORIGFILENAME"]', 0) ->set_text($textattsref->{'filename_orig'});
    if ($TEIFORMAT eq "TEI") {
        $analytic->first_child('textLang')->set_text($LANGUAGE);
    }


    #-----------------------------------------------
    # fileDesc/sourceDesc/biblStruct/monogr
    #-----------------------------------------------

    my $monogr  = $textHeader->get_xpath("./fileDesc/sourceDesc/biblStruct/monogr", 0);
    my $imprint = $monogr->first_child("imprint");    # imprint is needed for tei validity

    $monogr->first_child($title)->set_text($PUBLTITLE);
    if ($TEIFORMAT eq "TEI") {
        $monogr->get_xpath('./imprint/date[@type="date"]', 0)->set_text($DATE);
    }

    # the date parts live in <date> (TEI) or <pubDate> (I5)
    my $date = "date";
    if ($TEIFORMAT eq "I5") { $date = "pubDate" }
    $monogr->get_xpath('./imprint/' . $date . '[@type="year"]', 0)  ->set_text($datearray[0]);
    $monogr->get_xpath('./imprint/' . $date . '[@type="month"]', 0) ->set_text($datearray[1]);
    $monogr->get_xpath('./imprint/' . $date . '[@type="day"]', 0)   ->set_text($datearray[2]);

    my $pubPlace = $imprint->first_child("pubPlace");
    $pubPlace->set_text($srcpubplaces{$PUBLTITLE});
    $pubPlace->set_att('key', $CountryKey);
    $imprint->first_child("publisher")->set_text($srcpublishers{$PUBLTITLE});
    # The <biblScope> fields are not filled at present:
    #$monogr->get_xpath('./biblScope[@unit="ISSUETITLE"]', 0) ->set_text($textattsref->{'issue_title'});
    #$monogr->get_xpath('./biblScope[@unit="ISSUENO"]', 0)    ->set_text($ISSUENO);
    #$monogr->get_xpath('./biblScope[@unit="ISSUEDATE"]', 0)  ->set_text($textattsref->{'issue_date'});
    #$monogr->get_xpath('./biblScope[@unit="pp"]', 0)         ->set_text($textattsref->{'page_no'});   # caution - page_no mostly seems to be "None"

    my $dateNice      = $datearray[2] . "." . $datearray[1] . "." . $datearray[0];    # DD.MM.YYYY
    my $dateBackwards = $datearray[0] . "." . $datearray[1] . "." . $datearray[2];    # YYYY.MM.DD

    if ($TEIFORMAT eq "I5") {
        my $refCompleteText = $textSigle . " " . $PUBLTITLE . " no. " . $ISSUENO . ", [" . $PUBLTYPE . "], " . $dateNice;
        my $refShortText    = $textSigle . " " . $PUBLTITLE . ", " . $dateNice;
        $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="complete"]', 0)->set_text($refCompleteText);
        $textHeader->get_xpath('./fileDesc/sourceDesc/reference[@type="short"]', 0)   ->set_text($refShortText);
    }


    #-----------------------------------------------
    # encodingDesc/tagsDecl: sentence and token counts
    #-----------------------------------------------

    # in TEI the <tagUsage> elements sit inside <namespace>, in I5 directly in <tagsDecl>
    my $namespacePath = "./encodingDesc/tagsDecl/namespace/";
    if ($TEIFORMAT eq "I5") { $namespacePath = "./encodingDesc/tagsDecl/" }

    $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="s"]', 0)->set_att('occurs', $SENTCOUNT);
    $textHeader->get_xpath($namespacePath . 'tagUsage[@gi="w"]', 0)->set_att('occurs', $TOKENCOUNT);


    #-----------------------------------------------
    # profileDesc
    #-----------------------------------------------

    if ($TEIFORMAT eq "I5") {
        $textHeader->get_xpath('./profileDesc/creation/creatDate', 0)->set_text($dateBackwards);
    }
    if ($TEIFORMAT eq "TEI") {
        my $language = $textHeader->get_xpath('./profileDesc/langUsage/language', 0);
        $language->set_att('ident', $LANGUAGE);
        $language->set_att('usage', $SUMLANG);
    }
    # @usage should actually hold an integer; ideally the content of sum_lang would be
    # split (on the literal '|', i.e. /\|/) into several <language> elements

    # publication type as given, and mapped via %mapping; an unknown type maps to an
    # empty string instead of an undefined value
    my $mappedType = defined($mapping{$PUBLTYPE}) ? $mapping{$PUBLTYPE} : "";
    $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk"]', 0)       ->set_text($PUBLTYPE);
    $textHeader->get_xpath('./profileDesc/textClass/classCode[@scheme="kielipankki_klk_mapped"]', 0)->set_text($mappedType);


    #-----------------------------------------------
    # revisionDesc: date of this conversion
    #-----------------------------------------------

    $textHeader->get_xpath('./revisionDesc/change', 0)->set_att('when', localtime->ymd('-'));

    return $textID;
}
| |
# Turns a vrt <paragraph> into <p>.
# Example input: <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
#   @sum_lang is moved to xml:lang, prefixed with "x-" to mark it as a private value
#   @id       is dropped because it is not unique
sub setP {
    my ($paragraph) = @_;

    $paragraph->set_gi('p');

    my $sum_lang = $paragraph->att("sum_lang");
    $paragraph->set_att("xml:lang", "x-" . $sum_lang);

    # $paragraph->change_att_name('id', 'xml:id');
    foreach my $obsolete ("sum_lang", "id") {
        $paragraph->del_att($obsolete);
    }
}
# Turns a vrt <sentence> into <s>.
# Example attributes: id="s-bcd0f3fa-bbd3dac4-f7429090" lang="fin" lang_conf="0.6734853"
#   @lang      is copied to xml:lang and then removed
#              (ToDo: convert the value / introduce a lookup hash for "fin", "xxx", ...)
#   @id        is dropped because it is not unique
#   @lang_conf is dropped for the time being (ToDo: @cert?)
sub setS {
    my ($sentence) = @_;

    $sentence->set_gi('s');

    my $lang = $sentence->att("lang");
    $sentence->set_att("xml:lang", $lang);

    # $sentence->change_att_name('id', 'xml:id');
    foreach my $obsolete ('id', "lang", "lang_conf") {
        $sentence->del_att($obsolete);
    }
}
| |
# Fills a <w> element from one token line of the vrt input.
#   $w_element - the (new) <w> element
#   $line      - one line of tab-separated positional attributes
# With masking switched on ($MASK), the word form and lemma of every token whose
# pos is not "Punct" are replaced by one random alphabetic string of the same
# length as the word form.
sub createW {
    my ($w_element, $line) = @_;

    #---------------------------
    # Get the tags (=columns)
    #---------------------------

    # vrt word and positional-attributes in corpus KLK:
    # USE  [0]  word
    # USE  [1]  ref      (id for reference of dephead)
    # USE  [2]  lemma
    #  ?   [3]  lemmacomp (lemma with compound info - could go in @norm, as tag abuse?)
    # USE  [4]  pos
    # USE  [5]  msd
    # USE  [6]  dephead
    # USE  [7]  deprel
    #      [8]  content  (ocr-process)
    #      [9]  vpos     (ocr-process)
    #      [10] ocr      (ocr-process)
    #      [11] cc       (ocr-process)
    #      [12] hyph     (ocr-process)
    #      [13] style    (ocr-process)
    #      [14] lex      (korp semantic disambiguation from Goeteborg)
    my @tags = split(/\t/, $line);

    # A short line leaves some of the columns used below undefined; treat those as
    # empty strings so that the comparison and the attribute values stay well defined.
    for my $i (0 .. 7) {
        $tags[$i] = "" unless defined($tags[$i]);
    }

    # set word string and lemma string according to $MASK flag:
    my $w_string = "";
    my $l_string = "";
    if ($MASK && ($tags[4] ne "Punct")) {
        $w_string = Data::Random::String->create_random_string(length => length($tags[0]), contains => 'alpha');
        $l_string = $w_string;
    }
    else {
        $w_string = $tags[0];
        $l_string = $tags[2];
    }
    $w_element->set_text($w_string);

    # set the attributes of <w>:
    $w_element->set_att("n", $tags[1]);

    # no id: an id assembled from page, sentence and token number is not unique either
    $w_element->del_att("id");

    $w_element->set_att("lemma", $l_string);

    # $w_element->set_att("norm", $tags[3]);  # tag abuse of @norm
    $w_element->set_att("pos", $tags[4]);
    $w_element->set_att("msd", $tags[5]);

    if ($TEIFORMAT eq "I5") {    # remove condition when part of the official TEI
        $w_element->set_att("head",   $tags[6]);
        $w_element->set_att("deprel", $tags[7]);
    }
}
| |
| |
# Writes the corpus title into fileDesc/titleStmt of the corpus header:
#
#   <fileDesc>
#     <titleStmt>
#       <title>[source] [year], from [kielipankki corpus] for EuReCo</title>
#
# The title element is <c.title> for I5 and <title> otherwise.
sub set_title {
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

    my $titleElement = ($TEIFORMAT eq "I5") ? "c.title" : "title";
    my $titleStmt    = $corpusHeader->first_child("fileDesc")->first_child("titleStmt");

    $titleStmt->first_child($titleElement)->set_text(
        $source . " " . $year . ", from " . $kielipankkiCorpus . " for EuReCo"
    );
}
| |
# Writes the source description into fileDesc/sourceDesc of the corpus header:
#
#   <fileDesc>
#     <sourceDesc>
#       <bibl>[source] [year], from [kielipankki corpus] for EuReCo</bibl>
sub set_sourceDesc {
    my ($corpusHeader, $source, $year, $kielipankkiCorpus) = @_;

    my $sourceDesc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");

    $sourceDesc->first_child("bibl")->set_text(
        $source . " " . $year . ", from " . $kielipankkiCorpus . " for EuReCo"
    );
}
| |
# Fills fileDesc/sourceDesc of the I5 corpus header:
#
#   <sourceDesc>
#     <biblStruct>
#       <monogr>
#         <h.title>[title]</h.title>
#         <imprint>
#           <publisher>[publisher]</publisher>
#           <pubPlace key="[country key]">[place]</pubPlace>
#         </imprint>
#       </monogr>
#     </biblStruct>
#     <reference>[corpus sigle] [title]; [place]: [publisher], [year]</reference>
#   </sourceDesc>
#
# All values are looked up via the source name derived from the input file name.
sub set_sourceDescI5 {
    my ($corpusHeader) = @_;

    my $PUBLTITLE = $srcfullnames{$fnsource};
    my $PUBLPLACE = $srcpubplaces{$PUBLTITLE};
    my $PUBLISHER = $srcpublishers{$PUBLTITLE};

    my $YEAR   = $fnyear;
    my $CSIGLE = $corpusids{$PUBLTITLE} . substr($fnyear, 2, 2);

    my $sourceDesc = $corpusHeader->first_child("fileDesc")->first_child("sourceDesc");
    my $cMonogr    = $sourceDesc->first_child("biblStruct")->first_child("monogr");
    my $cImprint   = $cMonogr->first_child("imprint");

    $cMonogr->first_child("h.title")->set_text($PUBLTITLE);
    $cImprint->first_child("publisher")->set_text($PUBLISHER);

    my $cPubPlace = $cImprint->first_child("pubPlace");
    $cPubPlace->set_text($PUBLPLACE);
    $cPubPlace->set_att('key', $CountryKey);

    $sourceDesc->first_child("reference")->set_text(
        $CSIGLE . " " . $PUBLTITLE . "; " . $PUBLPLACE . ": " . $PUBLISHER . ", " . $YEAR
    );
}
| |
| |
| |
# Called for every <text>. When the text opens a new month (a month later than the
# last one seen), an <idsDoc> document header for that month is printed to standard
# output (I5 output only). Nothing is printed for January because the first
# <idsDoc> is inserted together with the corpus header.
#   $textattsref - reference to the attribute hash of <text> ('date', 'publ_title')
# Side effect: updates $LASTMONTH.
sub createIdsDoc {
    my ($textattsref) = @_;

    my $DATE      = $textattsref->{'date'};
    my $PUBLTITLE = $textattsref->{'publ_title'};

    my @datearray = split(/-/, $DATE);
    my $MONTH     = $datearray[1];
    my $YEAR      = $datearray[0];
    my $YY        = substr($YEAR, 2, 2);

    # Compare with ">" instead of "== last + 1": a month without any text must not
    # stop the headers of all following months from being written.
    my $monthNumber = $MONTH + 0;
    return if $monthNumber <= $LASTMONTH;
    $LASTMONTH = $monthNumber;

    return if $monthNumber == 1;          # January: header already part of the corpus header insertion
    return unless $TEIFORMAT eq "I5";     # <idsDoc> exists in I5 only

    # Prefer the table entry of the text's own publication title; fall back to the
    # source derived from the file name, which is what the text sigles are built from.
    my $corpusid = (defined($PUBLTITLE) && exists($corpusids{$PUBLTITLE}))
                 ? $corpusids{$PUBLTITLE}
                 : $corpusids{$fnsource};
    my $CSIGLE = $corpusid . $YY;

    my $DOCID     = $months{$MONTH};
    my $MONTHNAME = $monthnames{$MONTH};

    # The header is printed as a string, so the title has to be XML-escaped here.
    my $xmlTitle = defined($PUBLTITLE) ? $PUBLTITLE : "";
    $xmlTitle =~ s/&/&amp;/g;
    $xmlTitle =~ s/</&lt;/g;
    $xmlTitle =~ s/>/&gt;/g;

    my $idsDocString = "
<idsDoc version=\"1.0\" TEIform=\"TEI.2\">
<idsHeader type=\"document\" pattern=\"text\" version=\"1.1\" TEIform=\"teiHeader\">
<fileDesc>
<titleStmt>
<dokumentSigle>$CSIGLE/$DOCID</dokumentSigle>
<d.title>$xmlTitle, $MONTHNAME $YEAR</d.title>
</titleStmt>
<publicationStmt>
<distributor/>
<pubAddress/>
<availability region=\"world\">$kielipankkiLicense</availability>
<pubDate/>
</publicationStmt>
<sourceDesc>
<biblStruct>
<monogr>
<h.title/>
<imprint/>
</monogr>
</biblStruct>
</sourceDesc>
</fileDesc>
</idsHeader>
</idsDoc>\n";

    printf("%s\n", $idsDocString);
    return;
}
| |
| |
| |
| ################# |
| ## usage_message |
| ################# |
| |
| |
# Prints the usage text to STDERR and terminates the script.
#   $status - optional exit status; defaults to 0, which is what callers that pass
#             no argument get. Callers reporting a usage error may pass a
#             non-zero value.
sub usage_message {
    my ($status) = @_;
    $status = 0 unless defined($status);

    print STDERR "Usage: ./vrt2tei.pl [OPTIONS] <file.vrt.xml>\n";
    print STDERR "   <file.vrt.xml> is a VRT file as proper (well-formed) XML\n";
    print STDERR "   Options:\n";
    print STDERR "     -h           display this usage info and exit\n";
    print STDERR "     -t (tei|i5)  output format, default: tei\n";
    print STDERR "     -m           mask primary data\n";
    print STDERR "\n";
    exit $status;
}
| |
| |