| #! /usr/bin/perl -w |
| |
| |
| ########################################################################################################################################################### |
| # vrt2tei.pl |
| # eureco |
| # leibniz-institut fuer deutsche sprache / csc finland esbo |
| # august 2024 |
| # |
| # |
| # using XML::Twig , see http://www.xmltwig.org/, https://metacpan.org/pod/XML::Twig |
| # |
| # usage: see the usage_message function below |
| # Usage: ./vrt2tei.pl <vrtxmlfile.xml> > <outfile> (exactly one argument; the result is written to standard output) |
| # <vrtxmlfile>: xml-ised vrt file |
| # |
| # |
| # TODO: |
| # 1 insert dtd spec, or ref to TEI |
| |
| # 3a remove the vrt positional attribute comment line / all comment lines |
| # 3b add @head and @deprel to I5 sowie auch @msd |
| # 3c bearbeitung von @head und @deprel in tei2korapxml durch Nils? |
| # 3d build 30 billion corpus |
| |
| # 4a take care of IDs |
| # 4b see to the values of @xml:lang |
| # 5 abfangen von unerwarteten elementen dh andere als <sentence> und <paragraph> |
| # 5a wort reihenfolge nochmal checken |
| # 6 checks and balances |
| # 7 How to encode Kielipankki and National Library of Finland? in teiCorpus Header |
| # 8 construct <idsDoc>s for the months (or go for TEI) |
| # 9 parallelisation in bash and application on sub corpora of KLK |
| # 10 re-implementation of the gawk code in the perl script |
| # 12 re-implement creation of text header from xml file in another twig / parametrize TEI vs I5 |
| |
| |
| |
| #remember |
| #formatted.xml:105613: element w: validity error : No declaration for attribute deprel of element w |
| #formatted.xml:105613: element w: validity error : No declaration for attribute head of element w |
| |
| |
| # |
| # |
| ############################################################################################################################################################ |
| |
| |
| use strict; |
| use warnings; |
| |
| use XML::Twig; |
| use XML::Generator ':pretty'; # apparently no effect when using flush(); |
| |
| |
| use locale; # diese drei Zeilen, damit \b im regex nicht Umlaute und ß matcht. |
| use POSIX qw(locale_h); # to be able to use setlocale() |
| #setlocale(LC_ALL,'de_DE'); |
| setlocale(LC_ALL, "fi_FI"); |
| use utf8; |
| use open qw( :std :encoding(UTF-8) ); |
| |
| use Time::Piece; |
| use Tie::IxHash; |
| |
| |
| |
| #---------------------- |
| # check file arguments: |
| #---------------------- |
| |
| # arg0 infile: vrt-xml |
| |
# Exactly one argument is accepted: the path of the XML-ised VRT input file.
# The number of arguments is tested (not the truthiness of $ARGV[0] and
# $ARGV[1]), so that an input file literally named "0" is accepted and a stray
# second argument "0" is rejected.  An empty file name is still refused.
usage_message() unless scalar(@ARGV) == 1 && length($ARGV[0]);
| |
| |
| #################### |
| # GLOBAL VARIABLES |
| #################### |
| |
# Encoding passed to the main twig as output_encoding; it concerns the OUTPUT
# only.  Alternative value that was tried: "iso-8859-1".
my $encoding = 'UTF-8';

# Number of <text> elements handled so far (incremented in the text handler).
my $textcounter = 0;

# Placeholders, initialised to the empty string; $twig receives the main
# XML::Twig object further down.
my ($twig, $teiCorpusHeaderDoc) = ('', '');
| |
| |
| #------------------------------------------------------------------ |
| # read corpusHeaderSkeleton document and get header out of it |
| #------------------------------------------------------------------ |
| |
# Parse the corpus header skeleton (looked up relative to the current working
# directory) and keep its root element; root() later inserts that element as
# the header of the output corpus.
# XML::Twig->new is used instead of the indirect object form "new XML::Twig",
# which Perl can mis-parse and which is unavailable under "use v5.36".
my $teiCorpusHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiCorpusHeaderDocTwig->parsefile("teiCorpusHeaderSkeleton.tei.xml");

# the root of the skeleton document is used as the teiHeader of the corpus
my $corpusHeader = $teiCorpusHeaderDocTwig->root;
| |
| |
| #------------------------------------------------------------------ |
| # read textHeaderSkeleton document and get header out of it |
| #------------------------------------------------------------------ |
| |
# Parse the text header skeleton (looked up relative to the current working
# directory) and keep its root element; it is handed to the text handler for
# every <text> of the input.
# XML::Twig->new is used instead of the indirect object form "new XML::Twig",
# which Perl can mis-parse and which is unavailable under "use v5.36".
my $teiTextHeaderDocTwig = XML::Twig->new(
    keep_spaces     => 1,
    keep_atts_order => 1,
    comments        => 'drop',
);

$teiTextHeaderDocTwig->parsefile("teiTextHeaderSkeleton.tei.xml");

# the root of the skeleton document is used as the teiHeader of each text
my $textHeader = $teiTextHeaderDocTwig->root;
| |
| |
| #---------------------------------- |
| # read input VRT-XML document |
| #---------------------------------- |
| |
# Fail early, with the system error text, if the input file cannot be read.
# The handle is not used for parsing (parsefile() further down works on the
# file name), so it is closed again right away.
# Three-argument open: the file name is taken literally and can never be
# interpreted as an open mode or a pipe.
open(my $IN, '<', $ARGV[0])
    or die("$0: cannot open file for reading: $ARGV[0]: $!");
close($IN);
| |
| #----------------------------------------------------- |
| # global variables pertaining to the original corpus |
| #----------------------------------------------------- |
| |
# Name of the Kielipankki source corpus; insertCorpusHeader() passes it on to
# set_title() and set_sourceDesc().
my $kielipankkiCorpus = 'klk-fi-v2-vrt';
| |
| |
| |
| |
| ##################### |
| # M A I N |
| ##################### |
| |
| #------------------------------------------------------------------------------------------------------------- |
| # start twig for input and call start tag handler for root and twig handler for each <text> in the VRT |
| #------------------------------------------------------------------------------------------------------------- |
| |
| |
# Main twig over the VRT-XML input:
#   - the start tag of <texts> calls root() with the corpus header,
#   - every completely parsed <text> calls text() with the text header skeleton.
# XML::Twig->new is used instead of the indirect object form "new XML::Twig",
# which Perl can mis-parse and which is unavailable under "use v5.36".
$twig = XML::Twig->new(
    keep_spaces        => 1,         # whitespace at former element boundaries is kept in the output
    keep_atts_order    => 1,         # requires Tie::IxHash
    comments           => 'drop',
    start_tag_handlers => {
        texts => sub { root(@_, $corpusHeader) },
    },
    twig_handlers => {
        # text => \&text
        text => sub { text(@_, $textHeader) },
    },
    # dtd_handlers => {              # ToDo for I5
    #     \&set_dtd;
    # }

    output_encoding => $encoding,
);

$twig->parsefile($ARGV[0]);
| |
| |
| |
| |
| |
| ########### |
| # END MAIN |
| ########### |
| |
| |
| |
| |
| ############################## |
| # S U B R O U T I N E S |
| ############################## |
| |
| # sub set_dtd [ |
| # my $twig, $dtd = @_; |
| # my $internal = qq|\nPUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n"DTD/xhtml1-strict.dtd"|; |
| # |
| # $twig->twig_doctype('html', undef, undef, $internal); |
| # } |
| |
| |
| |
#----------------------------------------------------------------------
# root($twig, $root, $corpusHeader) - start-tag handler for <texts>
#
#   $twig         - the XML::Twig object (supplied by the handler call, not used)
#   $root         - the element whose start tag has just been read
#   $corpusHeader - header element that insertCorpusHeader() puts into $root
#
# Renames the element to <teiCorpus>, declares the TEI namespace on it and
# has the corpus header inserted.
#----------------------------------------------------------------------
sub root {
    my ($twig, $root, $corpusHeader) = @_;

    my $tei_namespace = 'http://www.tei-c.org/ns/1.0';

    $root->set_gi('teiCorpus');
    $root->set_att(xmlns => $tei_namespace);

    insertCorpusHeader($root, $corpusHeader);
}
| |
| |
| |
#----------------------------------------------------------------------
# insertCorpusHeader($root, $corpusHeader)
#
#   $root         - element that receives the header as its first child
#   $corpusHeader - corpus header element (from the skeleton document)
#
# Derives source name and year from the input file name in $ARGV[0], which is
# expected to look like <dirs>/<source><yyyy>.xml, writes them into the title
# and source description of the header, and pastes the header into $root.
# Returns what paste() returns.
#----------------------------------------------------------------------
sub insertCorpusHeader {
    my ($root, $corpusHeader) = @_;

    #---------------------------------------------------------------------------
    # get some metadata for the current output corpus based on source and year
    #---------------------------------------------------------------------------

    # file name without its directory part
    my $source = (split(/\//, $ARGV[0]))[-1];
    $source = '' unless defined $source;

    # Cut the trailing "<yyyy>.xml" off the source name and keep the year.
    # The capture is read only when the substitution matched; without that
    # check $1 would be undefined (or left over from an earlier match).
    my $yy = '';
    if ($source =~ s/([0-9][0-9][0-9][0-9])\.xml$//) {
        $yy = $1;
    }
    else {
        warn "$0: input file name '$ARGV[0]' does not end in <yyyy>.xml; "
           . "the corpus header gets no year\n";
    }

    #-----------------------
    # set corpus header
    #-----------------------

    # to do: also get the name of the corpus (klk-fi-v2-vrt) from the input
    set_title(     $corpusHeader, $source, $yy, $kielipankkiCorpus);
    set_sourceDesc($corpusHeader, $source, $yy, $kielipankkiCorpus);

    my $teiCorpusHeader = $corpusHeader->paste("first_child", $root);
    return $teiCorpusHeader;
}
| |
| |
| #---------------------------- |
| # handler &text for <text> |
| #---------------------------- |
| |
#----------------------------------------------------------------------
# text($twig, $text, $textHeader) - twig handler for one VRT <text>
#
#   $twig       - the XML::Twig object driving the parse
#   $text       - the completely parsed <text> element
#   $textHeader - text header skeleton, handed on to createTextHeader()
#
# Converts the element in place:
#   <text atts...>                ->  <TEI> with a header built from the atts
#   <paragraph>                   ->  <p> inside <text><body><div type="page">
#   <sentence>                    ->  <s>
#   each non-empty token line     ->  <w>
# and then flushes the twig to standard output.
# Increments the file-level counter $textcounter.
#----------------------------------------------------------------------
sub text {
    my ($twig, $text, $textHeader) = @_;

    $textcounter++;    # file-level variable

    # ToDo: catch all other, unexpected children of root

    #--------------------------------------------------------------------------
    # Get text metadata (attributes of <text>) and create teiHeader for <text>
    #--------------------------------------------------------------------------

    my $textattsref = $text->atts();    # reference to the attribute hash
    createTextHeader($text, $textattsref, $textHeader);

    #--------------------------
    # create <TEI> from <text>
    #--------------------------

    # the attributes have been handed to createTextHeader() above
    $text->del_atts;
    $text->set_gi("TEI");

    #------------------------------------------------------------------
    # create the <tei:text>, <body>, <div> elements inside <TEI>
    #------------------------------------------------------------------

    my $ttext_element = XML::Twig::Elt->new('text');
    my $body_element  = XML::Twig::Elt->new('body');
    my $div_element   = XML::Twig::Elt->new('div');

    $div_element->set_att("type", "page");        # ToDo: this is specific to KLK
    $ttext_element->set_att("xml:lang", 'fi');    # as in ICC-NOR

    $ttext_element->paste('last_child', $text);
    $body_element->paste('last_child', $ttext_element);
    $div_element->paste('last_child', $body_element);

    #-------------------------------
    # create <p> from <paragraph>
    #-------------------------------

    for my $paragraph ($text->children('paragraph')) {

        setP($paragraph);
        $paragraph->move('last_child', $div_element);

        #------------------------------
        # create <s> from <sentence>
        #------------------------------

        for my $sentence ($paragraph->children('sentence')) {

            setS($sentence);

            #--------------------------------------
            # create <w> (word) from each $line
            #--------------------------------------

            # text(), not xml_text(): xml_text() returns the content already
            # XML-escaped, and because set_text()/set_att() values are escaped
            # again on output, a token such as "&" would end up as "&amp;amp;".
            my @lines = split(/\n+/, $sentence->text);
            $sentence->set_text("\n");

            for my $line (@lines) {    # ToDo: check the order of the words
                next if $line eq "";

                my $w_element = XML::Twig::Elt->new('w');
                createW($w_element, $line);
                $w_element->paste('last_child', $sentence);
            }
        }
    }

    # $twig->set_pretty_print('record');

    # flush() is documented to take an optional filehandle, not a path, so
    # STDOUT is passed as a handle instead of the string "/dev/stdout".
    $twig->flush(\*STDOUT);
}
| |
#----------------------------------------------------------------------
# createTextHeader($text, $textattsref, $textHeader)
#
#   $text        - element that receives the header as its first child
#   $textattsref - hash reference with the attributes of the VRT <text>
#   $textHeader  - root element of the text header skeleton; it is only read
#
# Inserts a copy of the skeleton header into $text and fills title, date
# parts, ids and language from the attributes.  Reads the file-level counter
# $textcounter for the title.
# Dies with the offending path if the skeleton lacks a node that is filled
# in here.  Returns nothing.
#----------------------------------------------------------------------
sub createTextHeader {
    my ($text, $textattsref, $textHeader) = @_;

    # Attributes of a KLK <text> (example values).  "USE" marks those meant
    # to go into the header; only the ones read below are written so far.
    # USE 01 binding_id="2246025"
    # USE 02 date="2021-01-15"
    #     03 datefrom="20210115"
    #     04 dateto="20210115"
    #     05 elec_date="_"
    #     06 file=""
    # USE 07 filename_metadata="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_mets.xml"
    # USE 08 filename_orig    ="finclarin_siirto_k2021/0039-5552/2021/alto/2246025_0039-5552_2021-01-15_SK0221_page-2021011502210030301.xml
    # USE 09 id="t-bcd0f3fa-bbd3dac4"
    #     10 img_url=""
    # USE 11 issue_date="15.01.2021"
    # USE 12 issue_no="SK0221"
    # USE 13 issue_title="Suomen Kuvalehti"
    # USE 14 label="Suomen Kuvalehti no. SK0221 15.01.2021"
    # USE 16 language="fi"
    # USE 17 page_id="p1"
    # USE 18 page_no="None"
    #     19 part_name="_"
    #     20 publ_id="0039-5552"
    #     21 publ_part=""
    # USE 22 publ_title="Suomen Kuvalehti"
    # USE 23 publ_type="aikakausi"
    # USE 24 sentcount="70"
    # USE 25 sum_lang="|xxx:44|fin:23|eng:3|"
    #     26 timefrom="000000"
    #     27 timeto="235959"
    # USE 28 tokencount="304"
    #     29 version_added="KLK-fi-2021">

    # Attributes written into the header; an absent attribute becomes the
    # empty string so that no undefined value reaches set_text().
    my ($BID, $DATE, $METAFILENAME, $ORIGFILENAME, $ID, $LABEL, $LANGUAGE, $PAGEID)
        = map { defined $textattsref->{$_} ? $textattsref->{$_} : '' }
          qw(binding_id date filename_metadata filename_orig id label language page_id);

    #-----------------------------
    # Derived metadata
    #-----------------------------

    # date is expected as yyyy-mm-dd; missing parts become empty strings
    my @datearray = split(/-/, $DATE);
    my ($year, $month, $day) = map { defined $datearray[$_] ? $datearray[$_] : '' } 0 .. 2;

    my $title = $LABEL . ", Text #" . $textcounter;    # Case KLK

    #-----------------------------------------------------------------------
    # CREATE text-teiHeader ACCORDING TO THE SKELETON in $textHeader
    #-----------------------------------------------------------------------

    # Each text gets its own copy of the skeleton.  paste() is documented to
    # die for an element that already belongs to a tree, so pasting the one
    # shared skeleton element into every text would only work as long as it
    # happens to be detached again before the next text arrives.
    my $header = $textHeader->copy;
    $header->paste('first_child', $text);

    #-----------------------------------------------
    # <teiHeader>
    #   <fileDesc n="[EuReCo-KLK-FIN_$ID]">
    #     <titleStmt>
    #       <title>[$LABEL, page $PAGENO]</title>

    _set_header_text($header, './fileDesc/titleStmt/title', $title);

    #-----------------------------------------------
    #   <fileDesc>
    #     <sourceDesc>
    #       <biblStruct>
    #         <analytic>
    #           <title type="main">[$LABEL, page $PAGENO]</title>
    #           <date>[$DATE]</date>
    #           <date type="year">TODO</date>
    #           <date type="month">TODO</date>
    #           <date type="day">TODO</date>
    #           <idno type="PAGEID">$PAGEID</idno>
    #           <idno type="BINDINGID">$BID</idno>
    #           <idno type="ID">$ID</idno>
    #           <idno type="KIELIPANKKI_METAFILENAME">$METAFILENAME</idno>
    #           <idno type="KIELIPANKKI_ORIGFILENAME">$ORIGFILENAME</idno>
    #           <textLang>$LANGUAGE</textLang>
    #         </analytic>

    my $analytic_path = './fileDesc/sourceDesc/biblStruct/analytic';
    my $analytic      = $header->get_xpath($analytic_path, 0);
    defined $analytic
        or die "$0: text header skeleton has no node '$analytic_path'\n";

    _set_header_text($analytic, './title',                                 $title);
    _set_header_text($analytic, './date[@type="year"]',                    $year);
    _set_header_text($analytic, './date[@type="month"]',                   $month);
    _set_header_text($analytic, './date[@type="day"]',                     $day);
    _set_header_text($analytic, './idno[@type="PAGEID"]',                  $PAGEID);
    _set_header_text($analytic, './idno[@type="BINDINGID"]',               $BID);
    _set_header_text($analytic, './idno[@type="ID"]',                      $ID);
    _set_header_text($analytic, './idno[@type="KIELIPANKKI_METAFILENAME"]', $METAFILENAME);
    _set_header_text($analytic, './idno[@type="KIELIPANKKI_ORIGFILENAME"]', $ORIGFILENAME);
    _set_header_text($analytic, './textLang',                              $LANGUAGE);

    #         <monogr>
    #           <title>$PUBLTITLE</title>
    #           <imprint>
    #             <pubPlace>TODO</pubPlace>
    #             <publisher>TODO</publisher>
    #           </imprint>
    #           <biblScope unit="ISSUETITLE"/>
    #           <biblScope unit="ISSUENO"/>
    #           <biblScope unit="ISSUEDATE"/>
    #           <biblScope unit="pp">$PAGENO</biblScope>
    #         <monogr>
    #
    # TODO: ./fileDesc/sourceDesc/biblStruct/monogr is not filled in yet; the
    # skeleton content is emitted unchanged.  The same holds for encodingDesc,
    # profileDesc and revisionDesc.  The disabled code below shows how these
    # parts were built before the skeleton was introduced.

    ##TMP     # create <teiHeader> inside <TEI>
    ##TMP     my $teiHeader = XML::Twig::Elt->new('teiHeader');
    ##TMP #    $teiHeader->paste('first_child', $text);
    ##TMP
    ##TMP     ## insert_new_elt is a combo of new and paste, cf. xml::twig docu:
    ##TMP     ## insert_new_elt ($opt_position, $gi, $opt_atts_hashref, @opt_content)
    ##TMP
    ##TMP     my $fileDesc     = $teiHeader->insert_new_elt('fileDesc' => {n => "EuReCo_KLK-fi_" . $namearray[4]});
    ##TMP     my $encodingDesc = $teiHeader->insert_new_elt("last_child", 'encodingDesc');
    ##TMP     my $profileDesc  = $teiHeader->insert_new_elt("last_child", 'profileDesc');
    ##TMP     my $revisionDesc = $teiHeader->insert_new_elt("last_child", 'revisionDesc');
    ##TMP
    ##TMP     #---------------------
    ##TMP     # fileDesc/titleStmt
    ##TMP     #---------------------
    ##TMP     my $titleStmt = $fileDesc ->insert_new_elt('titleStmt');
    ##TMP     my $title     = $titleStmt->insert_new_elt("last_child", 'title');
    ##TMP     my $respStmt  = $titleStmt->insert_new_elt("last_child", 'respStmt');
    ##TMP     my $resp      = $respStmt ->insert_new_elt("last_child", 'resp');
    ##TMP     my $name      = $respStmt ->insert_new_elt("last_child", 'name');
    ##TMP
    ##TMP     # set texts for titleStmt
    ##TMP #    $title->set_text($LABEL . ", page " . $PAGENO); # note: PAGENO seems to be "None" most of the time
    ##TMP     $title->set_text($LABEL . ", Text #" . $textcounter); # at least for Suomen Kuvalehti
    ##TMP     $resp ->set_text("compiled by EuReCo");
    ##TMP     $name ->set_text("EuReCo: HL");
    ##TMP
    ##TMP     #--------------------------
    ##TMP     # fileDesc/publicationStmt
    ##TMP     #--------------------------
    ##TMP     my $publicationStmt = $fileDesc       ->insert_new_elt("last_child", 'publicationStmt');
    ##TMP     my $distributor     = $publicationStmt->insert_new_elt("last_child", 'distributor');
    ##TMP     my $note            = $distributor    ->insert_new_elt("last_child", 'note');
    ##TMP     my $availability    = $publicationStmt->insert_new_elt("last_child", 'availability');
    ##TMP     my $licence         = $availability   ->insert_new_elt("last_child", 'licence');
    ##TMP
    ##TMP     # set texts for publicationStmt
    ##TMP     $note   ->set_text("NOT FOR DISTRIBUTION - to be used locally in EuReCo");
    ##TMP     $licence->set_text("CLARIN_RES"); # TODO: more detailed licence info is in the KLK metadata record
    ##TMP
    ##TMP     #------------------------------
    ##TMP     # fileDesc/sourceDesc/biblStruct
    ##TMP     #------------------------------
    ##TMP     my $sourceDesc = $fileDesc  ->insert_new_elt("last_child", 'sourceDesc');
    ##TMP     my $biblStruct = $sourceDesc->insert_new_elt("last_child", 'biblStruct');
    ##TMP
    ##TMP     # fileDesc/sourceDesc/biblStruct/analytic
    ##TMP     my $analytic                = $biblStruct->insert_new_elt("last_child", 'analytic');
    ##TMP     my $analytic_title          = $analytic->insert_new_elt("last_child", 'title' => {type => "main"} );
    ##TMP #    my $analytic_date           = $analytic->insert_new_elt("last_child", 'date');
    ##TMP     my $analytic_date_year      = $analytic->insert_new_elt("last_child", 'date' => {type => "year"});
    ##TMP     my $analytic_date_month     = $analytic->insert_new_elt("last_child", 'date' => {type => "month"});
    ##TMP     my $analytic_date_day       = $analytic->insert_new_elt("last_child", 'date' => {type => "day"});
    ##TMP     my $analytic_idno_pageid    = $analytic->insert_new_elt("last_child", 'idno' => {type => "PAGEID"});
    ##TMP     my $analytic_idno_bindingid = $analytic->insert_new_elt("last_child", 'idno' => {type => "BINDINGID"});
    ##TMP     my $analytic_idno_id        = $analytic->insert_new_elt("last_child", 'idno' => {type => "ID"});
    ##TMP     my $analytic_idno_metafile  = $analytic->insert_new_elt("last_child", 'idno' => {type => "KIELIPANKKI_METAFILENAME"});
    ##TMP     my $analytic_idno_origfile  = $analytic->insert_new_elt("last_child", 'idno' => {type => "KIELIPANKKI_ORIGFILENAME"});
    ##TMP     my $analytic_textlang       = $analytic->insert_new_elt("last_child", 'textLang');
    ##TMP
    ##TMP     # set texts for analytic
    ##TMP #    $analytic_title         ->set_text($LABEL . ", page " . $PAGENO); # note: $PAGENO seems to be "None" most of the time
    ##TMP     $analytic_title         ->set_text($LABEL . ", Text #" . $textcounter); # note: $PAGENO seems to be "None" most of the time
    ##TMP #    $analytic_date          ->set_text($DATE);
    ##TMP     $analytic_date_year     ->set_text($datearray[0]);
    ##TMP     $analytic_date_month    ->set_text($datearray[1]);
    ##TMP     $analytic_date_day      ->set_text($datearray[2]);
    ##TMP     $analytic_idno_pageid   ->set_text($PAGEID);
    ##TMP     $analytic_idno_bindingid->set_text($BID);
    ##TMP     $analytic_idno_id       ->set_text($ID);
    ##TMP     $analytic_idno_metafile ->set_text($METAFILENAME);
    ##TMP     $analytic_idno_origfile ->set_text($ORIGFILENAME);
    ##TMP     $analytic_textlang      ->set_text($LANGUAGE);
    ##TMP
    ##TMP     #-------------------------------------
    ##TMP     # fileDesc/sourceDesc/biblStruct/monogr
    ##TMP     #-------------------------------------
    ##TMP     my $monogr               = $biblStruct->insert_new_elt("last_child", 'monogr');
    ##TMP     my $monogr_title         = $monogr    ->insert_new_elt("last_child", 'title');
    ##TMP     my $imprint              = $monogr    ->insert_new_elt("last_child", 'imprint');   # imprint is needed for validity
    ##TMP     my $pubPlace             = $imprint   ->insert_new_elt("last_child", 'pubPlace');  # imprint is needed for validity
    ##TMP     my $publisher            = $imprint   ->insert_new_elt("last_child", 'publisher'); # imprint is needed for validity
    ##TMP     my $biblScope_issuetitle = $monogr    ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUETITLE'} );
    ##TMP     my $biblScope_issueno    = $monogr    ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUENO'} );
    ##TMP     my $biblScope_issuedate  = $monogr    ->insert_new_elt("last_child", 'biblScope' => {unit => 'ISSUEDATE'} );
    ##TMP     my $biblScope_pp         = $monogr    ->insert_new_elt("last_child", 'biblScope' => {unit => 'PAGENO'} ); # note: PAGENO is "None" most of the time?
    ##TMP
    ##TMP     # set texts for monogr
    ##TMP     $monogr_title        ->set_text($PUBLTITLE);
    ##TMP     $pubPlace            ->set_text("TODO");
    ##TMP     $pubPlace            ->set_att("key",'FI');
    ##TMP     $publisher           ->set_text("TODO");
    ##TMP     $biblScope_issuetitle->set_text($ISSUETITLE);
    ##TMP     $biblScope_issueno   ->set_text($ISSUENO);
    ##TMP     $biblScope_issuedate ->set_text($ISSUEDATE);
    ##TMP     $biblScope_pp        ->set_text($PAGENO);
    ##TMP
    ##TMP     #---------------
    ##TMP     # encodingDesc
    ##TMP     #---------------
    ##TMP     my $tagsDecl   = $encodingDesc->insert_new_elt("last_child", 'tagsDecl');
    ##TMP     my $namespace  = $tagsDecl    ->insert_new_elt("last_child", 'namespace' => {name => 'http://www.tei-c.org/ns/1.0'});
    ##TMP     my $tagUsage_s = $namespace   ->insert_new_elt("last_child", 'tagUsage' => {gi => 's', occurs => $SENTCOUNT});
    ##TMP     my $tagUsage_w = $namespace   ->insert_new_elt("last_child", 'tagUsage' => {gi => 'w', occurs => $TOKENCOUNT});
    ##TMP
    ##TMP     #-------------
    ##TMP     # profileDesc
    ##TMP     #-------------
    ##TMP     my $langUsage    = $profileDesc->insert_new_elt("last_child", 'langUsage');
    ##TMP     my $language     = $langUsage  ->insert_new_elt("last_child", 'language' => {ident => $LANGUAGE, usage => $SUMLANG});
    ##TMP     # note: @usage should really hold an integer; best to split up the content
    ##TMP     # of SUMLANG and create several <language> elements
    ##TMP     my $textClass    = $profileDesc->insert_new_elt("last_child", 'textClass');
    ##TMP     my $classCode_fi = $textClass  ->insert_new_elt("last_child", 'classCode' => {scheme => "KLK_PUBLTYPE"});
    ##TMP #    my $classCode_en = $textClass  ->insert_new_elt("last_child", 'classCode' => {scheme => "KLK_PUBLTYPE_MAPPED"});
    ##TMP
    ##TMP     #---------------------------
    ##TMP     # set texts for profileDesc
    ##TMP     #---------------------------
    ##TMP     $classCode_fi ->set_text($PUBLTYPE);
    ##TMP #    $classCode_en->set_text($PUBLTYPETRANSL);
    ##TMP
    ##TMP     #---------------
    ##TMP     # revisionDesc
    ##TMP     #---------------
    ##TMP     my $change = $revisionDesc->insert_new_elt("last_child", 'change' => {when => localtime->ymd('-'), who => 'HL' });
    ##TMP
    ##TMP     # set texts for revisionDesc
    ##TMP     $change->set_text("TEI version for EuReCo");

    ###################################
    # END OF CREATING TEIHEADER
    ###################################

    return;
}

# Sets the text of the first node that $xpath selects below $context.
# Dies naming the path when the skeleton has no such node.
sub _set_header_text {
    my ($context, $xpath, $value) = @_;

    my $node = $context->get_xpath($xpath, 0);
    defined $node
        or die "$0: text header skeleton has no node '$xpath'\n";

    $node->set_text($value);
    return $node;
}
| |
#----------------------------------------------------------------------
# setP($paragraph) - turn a VRT <paragraph> into a TEI <p>
#
# Example input: <paragraph id="p-bcd0f3fa-bbd3dac4-815ead7a" sum_lang="|fin:1|">
#   @sum_lang - its value, prefixed with "x-" (private value), goes to @xml:lang
#   @id       - removed, because these ids are not unique
#----------------------------------------------------------------------
sub setP {
    my ($paragraph) = @_;

    my $private_lang = "x-" . $paragraph->att("sum_lang");

    $paragraph->set_gi('p');
    $paragraph->set_att("xml:lang", $private_lang);

    # alternative that was considered: $paragraph->change_att_name('id', 'xml:id');
    $paragraph->del_att("sum_lang", "id");
}
#----------------------------------------------------------------------
# setS($sentence) - turn a VRT <sentence> into a TEI <s>
#
# Attributes of <sentence>:
#   @id="s-bcd0f3fa-bbd3dac4-f7429090"  removed, not unique
#   @lang="fin"                         value copied to @xml:lang, then removed
#   @lang_conf="0.6734853"              removed for the time being (ToDo: @cert ?)
#----------------------------------------------------------------------
sub setS {
    my ($sentence) = @_;

    $sentence->set_gi('s');

    # ToDo: convert the value / introduce a hash for lookup
    #       (input values: "fin", "xxx", ....)
    my $lang = $sentence->att("lang");
    $sentence->set_att("xml:lang", $lang);

    # alternative that was considered: $sentence->change_att_name('id', 'xml:id');
    $sentence->del_att('id', "lang", "lang_conf");
}
| |
#----------------------------------------------------------------------
# createW($w_element, $line) - fill a <w> element from one VRT token line
#
#   $w_element - the element to fill
#   $line      - tab-separated positional attributes of one token
#
# vrt positional-attributes in corpus KLK:
#   USE  [0] word
#   USE  [1] ref       (id for reference of dephead)
#   USE  [2] lemma
#   ?    [3] lemmacomp (lemma with compound info - could go in @norm, as tag abuse?)
#   USE  [4] pos
#   USE  [5] msd
#   USE  [6] dephead
#   USE  [7] deprel
#        [8] content   (ocr-process)
#        [9] vpos      (ocr-process)
#       [10] ocr       (ocr-process)
#       [11] cc        (ocr-process)
#       [12] hyph      (ocr-process)
#       [13] style     (ocr-process)
#       [14] lex       (korp semantic disambiguation from G"oteborg)
#
# Written: the word as element text, and @n, @lemma, @pos, @msd.
# Not written for the time being: @norm (lemmacomp), @head, @deprel.
#----------------------------------------------------------------------
sub createW {
    my ($w_element, $line) = @_;

    # columns 0-2, 4 and 5 are used; column 3 (lemmacomp) is skipped
    my @columns = split(/\t/, $line);
    my ($word, $ref, $lemma, undef, $pos, $msd) = @columns;

    # content of <w>, i.e. the token
    $w_element->set_text($word);

    $w_element->set_att(n => $ref);

    # No id is set: an id built from page, sentence and token number would
    # not be unique either.
    $w_element->del_att("id");

    $w_element->set_att(
        lemma => $lemma,
        pos   => $pos,
        msd   => $msd,
    );
}
| |
| |
#----------------------------------------------------------------------
# set_title($corpusHeader, $source, $yy, $kielipankkiCorpus)
#
# Writes the corpus title into fileDesc/titleStmt/title of $corpusHeader:
#
#   <teiHeader>
#     <fileDesc>
#       <titleStmt>
#         <title>[Aamulehti2021] from [klk-fi-v2-vrt for] EuReCo</title>
#       </titleStmt>
#       <!-- ... -->
#     </fileDesc>
#   </teiHeader>
#----------------------------------------------------------------------
sub set_title {
    my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;

    # walk down to the <title> node step by step
    my $fileDesc   = $corpusHeader->first_child("fileDesc");
    my $titleStmt  = $fileDesc->first_child("titleStmt");
    my $cTitleNode = $titleStmt->first_child("title");

    $cTitleNode->set_text("$source $yy, from $kielipankkiCorpus for EuReCo");
}
| |
#----------------------------------------------------------------------
# set_sourceDesc($corpusHeader, $source, $yy, $kielipankkiCorpus)
#
# Writes the source description into fileDesc/sourceDesc/bibl of
# $corpusHeader:
#
#   <teiHeader>
#     <fileDesc>
#       <!-- ... -->
#       <sourceDesc>
#         <bibl>[Aamulehti2021] from [klk-fi-v2-vrt]</bibl>
#       </sourceDesc>
#       <!-- ... -->
#     </fileDesc>
#   </teiHeader>
#----------------------------------------------------------------------
sub set_sourceDesc {
    my ($corpusHeader, $source, $yy, $kielipankkiCorpus) = @_;

    # walk down to the <bibl> node step by step
    my $fileDesc   = $corpusHeader->first_child("fileDesc");
    my $sourceDesc = $fileDesc->first_child("sourceDesc");
    my $cBiblNode  = $sourceDesc->first_child("bibl");

    $cBiblNode->set_text("$source $yy, from $kielipankkiCorpus for EuReCo");
}
| |
| |
| |
| |
| |
| |
| |
| ################# |
| ## usage_message |
| ################# |
| |
| |
#----------------------------------------------------------------------
# usage_message() - print the usage text and terminate the program
#
# The text goes to STDERR, because STDOUT carries the converted document and
# is normally redirected into the output file.  The exit status is 1 so that
# a calling shell script can detect the wrong invocation.
# The script takes exactly one argument; the former usage line advertised an
# <outfile> argument that the argument check rejects.
#----------------------------------------------------------------------
sub usage_message {
    print STDERR "  Usage: ./vrt2tei.pl <file.vrt.xml> > <outfile>\n";
    print STDERR "  <file.vrt.xml> is a VRT file converted to proper XML\n";
    print STDERR "  The result is written to standard output.\n";
    exit 1;
}
| |
| |