"""Rebuild one TEI file from the corpus directory as a skeleton ``teiCorpus``.

The script parses the first entry of ``path``, collects the metadata elements
(looked up by their ``type`` attribute), counts the top-level TEI ``text``
elements, strips every child of the root, and then creates one
``TEI/teiHeader/fileDesc`` skeleton per counted text.  The result is written
to ``04_output.xml`` in the current working directory.
"""
import os
import xml.etree.ElementTree as ET

path = '/home/spassova/BGCorpusExamples'
files = os.listdir(path)

# Only the first directory entry is processed.
# NOTE(review): os.listdir() returns entries in arbitrary order, so which file
# is "first" is not guaranteed to be stable between runs/machines; an empty
# directory raises IndexError here.
tree = ET.parse(os.path.join(path, files[0]))
root = tree.getroot()


def _find_by_type(type_name):
    """Return all elements below ``root`` whose ``type`` attribute is *type_name*."""
    return root.findall(f".//*[@type='{type_name}']")


# Metadata elements, gathered before the tree is emptied below.  They are not
# consumed yet by the code that follows.
titles = _find_by_type('title')
domains = _find_by_type('domain')
pageURLs = _find_by_type('pageURL')
ids = _find_by_type('id')
mainImageURLs = _find_by_type('mainImageURL')
mainImageTexts = _find_by_type('mainImageTexts')
mainImageSources = _find_by_type('mainImageSources')
authors = _find_by_type('authors')
authorURLs = _find_by_type('authorURLs')
categories = _find_by_type('category')
subCategories = _find_by_type('subCategory')
tags = _find_by_type('tags')
datesPublished = _find_by_type('datePublished')
# NOTE(review): 'timePubished' looks like a misspelling of 'timePublished';
# kept as-is because the corpus files may use the same spelling -- confirm.
timesPublished = _find_by_type('timePubished')
datesModified = _find_by_type('dateModified')
timesModified = _find_by_type('timeModified')

# Count only the TEI-namespaced <text> elements that are direct children of
# the root; this decides how many skeleton entries are generated.
number_of_texts = len(root.findall('{http://www.tei-c.org/ns/1.0}text'))

# findall() returns a list, so removing children while looping over it is safe.
for elem in root.findall('*'):
    root.remove(elem)

root.tag = 'teiCorpus'

for i in range(number_of_texts):
    tei = ET.SubElement(root, 'TEI')
    teiHeader = ET.SubElement(tei, 'teiHeader')
    fileDesc = ET.SubElement(teiHeader, 'fileDesc')
    titleStmt = ET.SubElement(fileDesc, 'titleStmt')
    textSigle = ET.SubElement(titleStmt, 'textSigle')
    # Zero-padded running number, e.g. BNC/TST.00000, BNC/TST.00001, ...
    textSigle.text = f'BNC/TST.{i:05}'
    sourceDesc = ET.SubElement(fileDesc, 'sourceDesc')

ET.indent(tree, ' ')
# NOTE(review): every element created above uses an unqualified tag, so this
# default-namespace registration only matters if namespaced names remain in
# the tree -- confirm whether the output is meant to be in the TEI namespace.
ET.register_namespace('', 'http://www.tei-c.org/ns/1.0')
tree.write('04_output.xml', encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)