blob: bf5bf4f70ed277c9e91b810613cca736b8608a09 [file] [log] [blame]
lora-sp66978642023-03-08 11:02:52 +01001import os
2import xml.etree.ElementTree as ET
3
def main(path="/home/spassova/BGCorpusExamples/", output_dir="."):
    """Convert every file in *path* and write them as one ``teiCorpus`` document.

    Each file is converted by :func:`processing` and the resulting ``teiDoc``
    element is appended to a single ``teiCorpus`` root, which is then
    pretty-printed and written to ``<NN>_output.xml`` (``NN`` is the
    two-digit index of the last processed file).

    Args:
        path: Directory containing the input XML files.
        output_dir: Directory the output file is written to.

    If *path* contains no entries, nothing is written.
    """
    # Build the tree around the very root that receives the documents;
    # ET.parse() expects a file name, not an XML string.
    corpusRoot = ET.fromstring("<teiCorpus></teiCorpus>")
    corpusTree = ET.ElementTree(corpusRoot)

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # file index used in the text sigles stable between runs.
    files = sorted(os.listdir(path))
    if not files:
        return

    for j, name in enumerate(files):
        processing(os.path.join(path, name), corpusRoot, j)

    ET.indent(corpusTree, " ")
    corpusTree.write(
        os.path.join(output_dir, f"{j:02}" + "_" + "output.xml"),
        encoding="utf-8",
        xml_declaration=True,
        method="xml",
        short_empty_elements=True,
    )


def _text_at(elements, index):
    """Return ``elements[index].text``, or ``""`` if the entry or its text is missing."""
    if index < len(elements) and elements[index].text is not None:
        return elements[index].text
    return ""


def processing(file, corpus_root=None, file_index=0):
    """Convert one TEI input file into a ``teiDoc`` element.

    The file is parsed, ``metadata`` divisions are stripped from the text
    content, the root is emptied and renamed to ``teiDoc``, and one ``TEI``
    element (header plus ``text/body``) is created per ``text`` element
    found in the input.

    Args:
        file: Path of the XML file to convert.
        corpus_root: Optional element the resulting ``teiDoc`` is appended to.
        file_index: Index of the file, used as the two-digit part of each
            text sigle (``BNC/<file_index>.<text index>``).

    Returns:
        The ``teiDoc`` element built from *file*.

    Side effects:
        Registers the TEI namespace as the default namespace for
        serialization via ``ET.register_namespace``.
    """
    # Parse tree and get root
    tree = ET.parse(file)
    root = tree.getroot()

    # Store metadata in lists. Only the fields written to the header are
    # collected; the i-th entry of each list is paired with the i-th text.
    titles = root.findall(".//*[@type='title']")
    authors = root.findall(".//*[@type='authors']")
    datesPublished = root.findall(".//*[@type='datePublished']")
    timesPublished = root.findall(".//*[@type='timePublished']")
    pageURLs = root.findall(".//*[@type='pageURL']")

    # Count text elements and remove metadata
    texts = []
    number_of_texts = 0
    for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
        number_of_texts += 1
        for body in text:
            for div1 in body:
                for div2 in div1:
                    # Iterate over a copy: removing children while iterating
                    # the element itself skips the sibling after each removal.
                    for div3 in list(div2):
                        if div3.get("type") == "metadata":
                            div2.remove(div3)
                    texts.append(div2)

    # Remove all elements from root
    for elem in list(root):
        root.remove(elem)

    # Rename root
    root.tag = "teiDoc"

    # Create i5 structure
    for i in range(number_of_texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")

        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = "BNC/" + f"{file_index:02}" + "." + f"{i:05}"

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        ET.SubElement(analytic, "h.title").text = _text_at(titles, i)
        ET.SubElement(analytic, "h.author").text = _text_at(authors, i)

        imprint = ET.SubElement(sourceDesc, "imprint")
        # The slices assume a date formatted like YYYY-MM-DD -- TODO confirm
        # against the corpus.
        date = _text_at(datesPublished, i)
        date_parts = (
            ("year", date[0:4]),
            ("month", date[5:7]),
            ("day", date[8:10]),
            ("time", _text_at(timesPublished, i)),
        )
        for kind, value in date_parts:
            pubDate = ET.SubElement(imprint, "pubDate")
            pubDate.set("type", kind)
            pubDate.text = value

        pubPlace = ET.SubElement(imprint, "pubPlace")
        ref = ET.SubElement(pubPlace, "ref")
        ref.set("type", "page_url")
        ref.set("target", _text_at(pageURLs, i))

        new_text = ET.SubElement(tei, "text")
        new_body = ET.SubElement(new_text, "body")
        # A text without the expected nested divisions contributes no content.
        if i < len(texts):
            new_body.extend(list(texts[i]))

    if corpus_root is not None:
        corpus_root.append(root)
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    return root
100
101