blob: d143470c4440f9d2f85b7eb229fc78ce00bb8fa0 [file] [log] [blame]
lora-spab4e0ea2023-03-10 12:02:24 +01001import os, sys
lora-sp66978642023-03-08 11:02:52 +01002import xml.etree.ElementTree as ET
lora-spab4e0ea2023-03-10 12:02:24 +01003from xml.dom import minidom
4
lora-sp66978642023-03-08 11:02:52 +01005
def main():
    """Create the corpus skeleton, convert one document, and write the final corpus.

    Steps, in order:
      1. Write an empty ``<teiCorpus>`` element, pretty-printed through
         minidom, to the working file ``tree_structure.xml``.
      2. Call ``process`` with document index 0 and the path given as the
         first command-line argument.
      3. Re-read ``tree_structure.xml``, indent it, and write it to
         ``output.p5.xml`` as UTF-8 with an XML declaration.

    Raises:
        IndexError: if no command-line argument was supplied.
    """
    # Seed the working file with an empty corpus element.
    skeleton = ET.fromstring("<teiCorpus>\n</teiCorpus>")
    serialized = ET.tostring(skeleton)
    pretty = minidom.parseString(serialized).toprettyxml(indent=" ")
    with open("tree_structure.xml", "w") as handle:
        handle.write(pretty)

    # Only the first command-line argument is converted, always as document 0.
    source_path = sys.argv[1]
    process(0, source_path)

    # Re-read the working file, indent it, and emit the final corpus file.
    corpusTree = ET.parse("tree_structure.xml")
    ET.indent(corpusTree, " ")
    corpusTree.write(
        "output.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
lora-sp66978642023-03-08 11:02:52 +010023
lora-spab4e0ea2023-03-10 12:02:24 +010024
def process(j, file):
    """Convert one source document and append it to the corpus working file.

    The document at ``file`` is parsed, its per-text metadata values are
    collected, the metadata divisions are stripped from the text content, and
    the document root is rebuilt as a ``teiDoc`` element holding one ``TEI``
    element (header plus body) per text.  The rebuilt document is appended to
    the corpus read from ``tree_structure.xml`` and the corpus is written
    back to that same file so that callers re-reading it see the new document.

    Args:
        j: Integer document index; formatted with three digits into the
            ``textSigle`` value (``BNC/<j>.<text index>``).
        file: Path (or file object) of the source XML document.

    Raises:
        IndexError: if the document has fewer title / author / date / time /
            page URL entries, or fewer collected text divisions, than
            ``text`` elements.
        TypeError: if a ``datePublished`` element has no text.
    """
    tei_namespace = "http://www.tei-c.org/ns/1.0"

    # Parse corpus tree and get corpus root
    corpusTree = ET.parse("tree_structure.xml")
    corpusRoot = corpusTree.getroot()

    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()

    # Store metadata in lists; entry i is used for the i-th text element.
    titles = root.findall(".//*[@type='title']")
    pageURLs = root.findall(".//*[@type='pageURL']")
    authors = root.findall(".//*[@type='authors']")
    datesPublished = root.findall(".//*[@type='datePublished']")
    timesPublished = root.findall(".//*[@type='timePublished']")
    texts = []

    # Count text elements and remove metadata
    number_of_texts = 0
    for text in root.iter("{" + tei_namespace + "}text"):
        number_of_texts += 1
        for body in text:
            for div1 in body:
                for div2 in div1:
                    # Iterate over a snapshot: removing children from div2
                    # while iterating div2 itself would skip the element that
                    # follows each removed one.
                    for div3 in list(div2):
                        if div3.get('type') == "metadata":
                            div2.remove(div3)

                    # NOTE(review): one entry is collected per second-level
                    # division; the loop below assumes entry i belongs to
                    # text i - confirm each text has exactly one such division.
                    texts.append(div2)

    # Remove all elements from root
    for elem in root.findall("*"):
        root.remove(elem)

    # Rename root
    root.tag = "teiDoc"

    # Create target structure
    for i in range(number_of_texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")
        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = "BNC/" + f"{j:03}" + "." + f"{i:05}"
        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        htitle = ET.SubElement(analytic, "h.title")
        htitle.text = titles[i].text
        hauthor = ET.SubElement(analytic, "h.author")
        hauthor.text = authors[i].text
        imprint = ET.SubElement(sourceDesc, "imprint")

        # The date text is sliced positionally as YYYY?MM?DD.
        date_published = datesPublished[i].text
        pubDateYear = ET.SubElement(imprint, "pubDate")
        pubDateYear.set("type", "year")
        pubDateYear.text = date_published[0:4]
        pubDateMonth = ET.SubElement(imprint, "pubDate")
        pubDateMonth.set("type", "month")
        pubDateMonth.text = date_published[5:7]
        pubDateDay = ET.SubElement(imprint, "pubDate")
        pubDateDay.set("type", "day")
        pubDateDay.text = date_published[8:10]
        pubDateTime = ET.SubElement(imprint, "pubDate")
        pubDateTime.set("type", "time")
        pubDateTime.text = timesPublished[i].text

        pubPlace = ET.SubElement(imprint, "pubPlace")
        ref = ET.SubElement(pubPlace, "ref")
        ref.set("type", "page_url")
        ref.set("target", pageURLs[i].text)

        text = ET.SubElement(tei, "text")
        body = ET.SubElement(text, "body")
        for p in texts[i]:
            body.append(p)

    corpusRoot.append(root)

    # Serialize the TEI namespace as the default namespace (no "ns0:" prefix).
    # The registration is global to ElementTree, so it also applies to later
    # writes elsewhere in the program.
    ET.register_namespace("", tei_namespace)

    # Persist the extended corpus; without this the appended document only
    # exists in memory and is lost when the working file is parsed again.
    corpusTree.write("tree_structure.xml", encoding="utf-8", xml_declaration=True)
lora-sp132c3e52023-03-09 16:32:37 +0100116
lora-sp132c3e52023-03-09 16:32:37 +0100117
# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()