blob: 44d1cecd34dbfa3a96252545f4cf2048d9eb5088 [file] [log] [blame]
lora-spab4e0ea2023-03-10 12:02:24 +01001import os, sys
lora-sp66978642023-03-08 11:02:52 +01002import xml.etree.ElementTree as ET
lora-spab4e0ea2023-03-10 12:02:24 +01003from xml.dom import minidom
lora-sp09a58a02023-03-10 16:33:46 +01004from lxml import etree
5from io import StringIO
lora-spab4e0ea2023-03-10 12:02:24 +01006
lora-sp66978642023-03-08 11:02:52 +01007
def main():
    """Build a TEI corpus file from the documents named on the command line.

    An empty ``teiCorpus`` skeleton is written to
    ``/input/tree_structure.xml`` and parsed back. Each command-line
    argument is then passed to ``convert`` together with its zero-based
    position, and the resulting root element is appended to the corpus
    root. Finally the tree is indented and written to
    ``/output/corpus.p5.xml``.

    Documents that cannot be converted are reported on stderr and skipped,
    so a single bad input does not abort the whole run.
    """
    skeleton_path = "/input/tree_structure.xml"
    output_path = "/output/corpus.p5.xml"

    # Create the corpus skeleton from a string and save it into a file.
    skeleton = ET.fromstring("<teiCorpus>\n</teiCorpus>")
    pretty = minidom.parseString(ET.tostring(skeleton)).toprettyxml(indent=" ")
    with open(skeleton_path, "w") as f:
        f.write(pretty)

    # Parse the corpus tree back from the file just written.
    corpus_tree = ET.parse(skeleton_path)
    corpus_root = corpus_tree.getroot()

    # Process the documents and append each one to the corpus tree.
    for doc_number, path in enumerate(sys.argv[1:]):
        try:
            doc_root = convert(doc_number, path).getroot()
        except Exception as exc:
            # Best effort: skip the broken document, but say which one failed
            # and why. ``Exception`` (not a bare ``except``) lets Ctrl-C and
            # ``SystemExit`` still terminate the run.
            print(f"sorry: could not convert {path!r}: {exc!r}", file=sys.stderr)
            continue
        corpus_root.append(doc_root)

    # Indent and save the tree.
    ET.indent(corpus_tree, " ")
    corpus_tree.write(
        output_path,
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
lora-sp66978642023-03-08 11:02:52 +010034
lora-spab4e0ea2023-03-10 12:02:24 +010035
def _sub_element(parent, tag, text=None, **attrib):
    """Append a ``tag`` child to ``parent``, set its text and attributes, and return it."""
    child = ET.SubElement(parent, tag, **attrib)
    child.text = text
    return child


def convert(j, file):
    """Convert one source TEI document into a ``teiDoc`` tree.

    The document is parsed, the per-text metadata values (title, authors,
    publication date/time, page URL) are collected, and the metadata
    containers are removed from the text containers. The root element is
    then emptied, renamed to ``teiDoc`` and refilled with one ``TEI``
    element (header plus body) for every ``text`` element of the input.

    Args:
        j: Document number; formatted with three digits into the first part
            of every text sigle (``BNC/<j>.<i>``).
        file: File name or file object accepted by ``ET.parse``.

    Returns:
        The parsed ``ElementTree``; its root has been rewritten in place.

    Raises:
        xml.etree.ElementTree.ParseError: If the input is not well-formed.
        IndexError: If the document yields fewer metadata entries or text
            containers than ``text`` elements.
    """
    tei_ns = "http://www.tei-c.org/ns/1.0"

    # Parse the document tree and get its root.
    tree = ET.parse(file)
    root = tree.getroot()
    # The registry is global; an empty prefix makes the serializer write
    # TEI-namespace tags without a prefix.
    ET.register_namespace("", tei_ns)

    # Collect the metadata elements; the i-th entry of each list is used for
    # the i-th text below.
    titles = root.findall(".//*[@type='title']")
    page_urls = root.findall(".//*[@type='pageURL']")
    authors = root.findall(".//*[@type='authors']")
    dates_published = root.findall(".//*[@type='datePublished']")
    times_published = root.findall(".//*[@type='timePublished']")

    # Count the text elements and strip the metadata from their containers.
    texts = []
    number_of_texts = 0
    for text in root.iter(f"{{{tei_ns}}}text"):
        number_of_texts += 1
        for body in text:
            for div1 in body:
                for div2 in div1:
                    # Iterate over a snapshot: removing children while
                    # iterating the element itself skips the sibling that
                    # follows each removed node.
                    for div3 in list(div2):
                        if div3.get('type') == "metadata":
                            div2.remove(div3)
                    # NOTE(review): assumes one such container per text so
                    # that texts[i] lines up with the i-th text - confirm.
                    texts.append(div2)

    # Remove all elements from the root and rename it.
    for elem in root.findall("*"):
        root.remove(elem)
    root.tag = "teiDoc"

    # Create the target structure, one TEI element per text.
    for i in range(number_of_texts):
        tei = ET.SubElement(root, "TEI")
        tei_header = ET.SubElement(tei, "teiHeader")
        file_desc = ET.SubElement(tei_header, "fileDesc")

        title_stmt = ET.SubElement(file_desc, "titleStmt")
        _sub_element(title_stmt, "textSigle", f"BNC/{j:03}.{i:05}")

        source_desc = ET.SubElement(file_desc, "sourceDesc")
        analytic = ET.SubElement(source_desc, "analytic")
        _sub_element(analytic, "h.title", titles[i].text)
        _sub_element(analytic, "h.author", authors[i].text)

        imprint = ET.SubElement(source_desc, "imprint")
        # The slices presumably pick year/month/day out of a YYYY-MM-DD
        # string - TODO confirm the input date format.
        date = dates_published[i].text
        _sub_element(imprint, "pubDate", date[0:4], type="year")
        _sub_element(imprint, "pubDate", date[5:7], type="month")
        _sub_element(imprint, "pubDate", date[8:10], type="day")
        _sub_element(imprint, "pubDate", times_published[i].text, type="time")

        pub_place = ET.SubElement(imprint, "pubPlace")
        _sub_element(pub_place, "ref", type="page_url", target=page_urls[i].text)

        text = ET.SubElement(tei, "text")
        body = ET.SubElement(text, "body")
        # Move the remaining children of the text container into the body.
        body.extend(list(texts[i]))

    return tree
lora-sp132c3e52023-03-09 16:32:37 +0100121
lora-sp132c3e52023-03-09 16:32:37 +0100122
# Run the conversion only when executed as a script, so that importing this
# module does not trigger file I/O.
if __name__ == "__main__":
    main()