"""Rebuild the first file of a TEI corpus directory as a ``teiCorpus`` skeleton.

The script parses the first entry returned by ``os.listdir`` for the corpus
directory, collects the metadata elements (looked up by their ``type``
attribute), counts the TEI ``text`` children of the root, replaces the
root's children with one ``TEI`` header skeleton per text, and writes the
result to ``04_output.xml``.
"""
import os
import xml.etree.ElementTree as ET

CORPUS_DIR = '/home/spassova/BGCorpusExamples'
OUTPUT_FILE = '04_output.xml'
TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'

# Values of the ``type`` attribute that identify metadata elements.
# NOTE(review): 'timePubished' looks like a typo for 'timePublished'; it is
# kept exactly as in the original script -- confirm against the source data.
METADATA_TYPES = (
    'title',
    'domain',
    'pageURL',
    'id',
    'mainImageURL',
    'mainImageTexts',
    'mainImageSources',
    'authors',
    'authorURLs',
    'category',
    'subCategory',
    'tags',
    'datePublished',
    'timePubished',
    'dateModified',
    'timeModified',
)


def find_by_type(root, type_name):
    """Return every descendant of *root* whose ``type`` attribute equals *type_name*."""
    # Double-quoted Python string so the XPath predicate can use single quotes.
    return root.findall(f".//*[@type='{type_name}']")


def collect_metadata(root):
    """Return a dict mapping each entry of ``METADATA_TYPES`` to its matching elements."""
    return {type_name: find_by_type(root, type_name) for type_name in METADATA_TYPES}


def count_texts(root):
    """Return the number of direct ``text`` children of *root* in the TEI namespace."""
    return len(root.findall('{' + TEI_NAMESPACE + '}text'))


def build_corpus_skeleton(root, number_of_texts):
    """Turn *root* in place into a ``teiCorpus`` holding *number_of_texts* headers.

    All existing children of *root* are removed, its tag becomes
    ``teiCorpus``, and one ``TEI/teiHeader/fileDesc`` subtree is appended per
    text. Each ``fileDesc`` gets a ``titleStmt`` whose ``textSigle`` reads
    ``BNC/TST.`` followed by the zero-padded five-digit index, and an empty
    ``sourceDesc``.
    """
    # findall() returns a list, so removing while looping over it is safe.
    for child in root.findall('*'):
        root.remove(child)

    root.tag = 'teiCorpus'

    for index in range(number_of_texts):
        tei = ET.SubElement(root, 'TEI')
        tei_header = ET.SubElement(tei, 'teiHeader')
        file_desc = ET.SubElement(tei_header, 'fileDesc')
        title_stmt = ET.SubElement(file_desc, 'titleStmt')
        text_sigle = ET.SubElement(title_stmt, 'textSigle')
        text_sigle.text = 'BNC/TST.' + f'{index:05}'
        ET.SubElement(file_desc, 'sourceDesc')


def main():
    """Read the first corpus file, rebuild it as a skeleton and write the output.

    Raises:
        FileNotFoundError: if the corpus directory contains no entries.
    """
    files = os.listdir(CORPUS_DIR)
    if not files:
        raise FileNotFoundError(f'no input files found in {CORPUS_DIR}')

    # os.listdir() returns entries in arbitrary order; the first one is used.
    tree = ET.parse(os.path.join(CORPUS_DIR, files[0]))
    root = tree.getroot()

    # Must be collected before the root's children are removed below.
    # TODO: the metadata is not yet written into the generated headers.
    metadata = collect_metadata(root)
    number_of_texts = count_texts(root)

    build_corpus_skeleton(root, number_of_texts)

    ET.indent(tree, ' ')
    ET.register_namespace('', TEI_NAMESPACE)
    tree.write(
        OUTPUT_FILE,
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )


if __name__ == '__main__':
    main()