# blob: 6ea85c61c56fb70a6287cfffce92b35d938c92ac [file] [log] [blame]
import os
import xml.etree.ElementTree as ET
# Rebuild the first corpus file found in `path` as a skeleton <teiCorpus>
# document: the original children of the root are discarded and replaced by
# one <TEI> header stub per original top-level <text> element. The result is
# written to 04_output.xml in the current working directory.
path = '/home/spassova/BGCorpusExamples'
# os.listdir() returns entries in arbitrary order; sort so the same file is
# selected on every run.
files = sorted(os.listdir(path))
tree = ET.parse(os.path.join(path, files[0]))
root = tree.getroot()


def _find_by_type(type_name):
    """Return all descendants of ``root`` whose ``type`` attribute is *type_name*."""
    # Double-quoted f-string so the XPath predicate can use single quotes.
    return root.findall(f".//*[@type='{type_name}']")


# Metadata lookups. They must run before the root is emptied below, because
# afterwards there is nothing left to match.
titles = _find_by_type('title')
domains = _find_by_type('domain')
pageURLs = _find_by_type('pageURL')
ids = _find_by_type('id')
mainImageURLs = _find_by_type('mainImageURL')
mainImageTexts = _find_by_type('mainImageTexts')
mainImageSources = _find_by_type('mainImageSources')
authors = _find_by_type('authors')
authorURLs = _find_by_type('authorURLs')
categories = _find_by_type('category')
subCategories = _find_by_type('subCategory')
tags = _find_by_type('tags')
datesPublished = _find_by_type('datePublished')
# NOTE(review): 'timePubished' looks like a typo for 'timePublished'; kept
# unchanged because the attribute values in the corpus files are not visible
# here -- confirm against the data.
timesPublished = _find_by_type('timePubished')
datesModified = _find_by_type('dateModified')
timesModified = _find_by_type('timeModified')

# Only direct children of the root in the TEI namespace are counted.
number_of_texts = len(root.findall('{http://www.tei-c.org/ns/1.0}text'))

# Empty the root. Iterate over a snapshot, since removing children while
# iterating the live element would skip entries.
for elem in list(root):
    root.remove(elem)
root.tag = 'teiCorpus'

# One header stub per original text, numbered from 00000.
for i in range(number_of_texts):
    tei = ET.SubElement(root, 'TEI')
    teiHeader = ET.SubElement(tei, 'teiHeader')
    fileDesc = ET.SubElement(teiHeader, 'fileDesc')
    titleStmt = ET.SubElement(fileDesc, 'titleStmt')
    textSigle = ET.SubElement(titleStmt, 'textSigle')
    textSigle.text = f'BNC/TST.{i:05}'
    sourceDesc = ET.SubElement(fileDesc, 'sourceDesc')

ET.indent(tree, ' ')
# NOTE(review): the rebuilt elements use unqualified tag names, so this
# registration has no TEI-namespaced elements to apply to -- confirm whether
# the output is meant to carry the TEI namespace.
ET.register_namespace('', 'http://www.tei-c.org/ns/1.0')
tree.write('04_output.xml', encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)