blob: 055f45c16dbfe7c1f6e4b24c32d57bdfe6853247 [file] [log] [blame]
lora-spab4e0ea2023-03-10 12:02:24 +01001import os, sys
lora-sp66978642023-03-08 11:02:52 +01002import xml.etree.ElementTree as ET
lora-spab4e0ea2023-03-10 12:02:24 +01003from xml.dom import minidom
lora-sp09a58a02023-03-10 16:33:46 +01004from lxml import etree
5from io import StringIO
lora-spab4e0ea2023-03-10 12:02:24 +01006
lora-sp66978642023-03-08 11:02:52 +01007
def main():
    """Build a TEI corpus file from the documents named on the command line.

    An empty ``<teiCorpus>`` skeleton is written to
    ``input/tree_structure.xml`` and parsed back.  Each path in
    ``sys.argv[1:]`` is passed to ``convert`` together with its zero-based
    position, and the returned document root is appended to the corpus
    root.  The result is written to ``output/corpus.p5.xml``.

    A document that fails to convert is skipped: its path is printed to
    stdout and the reason to stderr.  ``KeyboardInterrupt`` and
    ``SystemExit`` are deliberately not caught, so the run can still be
    aborted.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    origRoot = ET.fromstring(corpus)
    corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs("input", exist_ok=True)
    os.makedirs("output", exist_ok=True)
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(corpusStr)

    # Parse corpus tree
    corpusTree = ET.parse("input/tree_structure.xml")
    corpusRoot = corpusTree.getroot()

    # Process documents and append to corpus tree
    for index, path in enumerate(sys.argv[1:]):
        try:
            currentRoot = convert(index, path).getroot()
        except Exception as exc:
            # Best effort: report the offending file and carry on with the
            # remaining ones.
            print(path)
            print(f"skipped {path}: {exc!r}", file=sys.stderr)
            continue
        corpusRoot.append(currentRoot)

    # Indent and save tree
    ET.indent(corpusTree, " ")
    corpusTree.write(
        "output/corpus.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
lora-sp66978642023-03-08 11:02:52 +010038
lora-spab4e0ea2023-03-10 12:02:24 +010039
def convert(j, file):
    """Convert one source XML document into the target ``teiDoc`` structure.

    The document is parsed, selected metadata values (title, authors, page
    URL, publication date and time) are looked up by their ``type``
    attribute, and the root is rebuilt as a ``teiDoc`` element holding one
    ``TEI`` child per ``text`` element found in the source.  Each ``TEI``
    gets a ``teiHeader`` filled from the metadata and a ``text/body`` that
    receives the children of the corresponding inner division, minus its
    ``type="metadata"`` children.

    Args:
        j: Document number; zero-padded to three digits in the text sigle
            ``BNC/<j>.<i>``.
        file: Path or file object accepted by ``ET.parse``.

    Returns:
        The parsed ``ElementTree`` whose root has been rewritten in place.

    Raises:
        IndexError: If fewer metadata entries or collected divisions exist
            than ``text`` elements.
        TypeError: If a date element or the page-URL element has no text.
    """
    tei_ns = "http://www.tei-c.org/ns/1.0"

    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", tei_ns)

    # Store metadata in lists.  The i-th entry of each list is used for the
    # i-th text below, so the lists are assumed to be aligned with the text
    # elements in document order.
    # Other metadata types that appear in the sources but are not used yet:
    # domain, id, mainImage*, authorURLs, category, subCategory, tags,
    # dateModified, timeModified.
    titles = root.findall(".//*[@type='title']")
    pageURLs = root.findall(".//*[@type='pageURL']")
    authors = root.findall(".//*[@type='authors']")
    datesPublished = root.findall(".//*[@type='datePublished']")
    timesPublished = root.findall(".//*[@type='timePublished']")
    texts = []

    # Count text elements and remove metadata
    number_of_texts = 0
    for text in root.iter(f"{{{tei_ns}}}text"):
        number_of_texts += 1
        for body in text:
            for div1 in body:
                for div2 in div1:
                    # Iterate over a snapshot: removing children while
                    # iterating the element itself skips the sibling that
                    # follows each removed child.
                    for div3 in list(div2):
                        if div3.get('type') == "metadata":
                            div2.remove(div3)
                    # NOTE(review): indentation was lost in the copy this
                    # was reconstructed from; one entry per inner division
                    # is assumed here -- confirm against the repository.
                    texts.append(div2)

    # Remove all elements from root (attributes and text are kept)
    del root[:]

    # Rename root
    root.tag = "teiDoc"

    # Create target structure
    for i in range(number_of_texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")
        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = f"BNC/{j:03}.{i:05}"

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        htitle = ET.SubElement(analytic, "h.title")
        htitle.text = titles[i].text
        hauthor = ET.SubElement(analytic, "h.author")
        hauthor.text = authors[i].text

        imprint = ET.SubElement(sourceDesc, "imprint")
        # The slices presumably target a "YYYY-MM-DD" date string; verify
        # against the source data.
        date = datesPublished[i].text
        for kind, value in (
            ("year", date[0:4]),
            ("month", date[5:7]),
            ("day", date[8:10]),
            ("time", timesPublished[i].text),
        ):
            pubDate = ET.SubElement(imprint, "pubDate")
            pubDate.set("type", kind)
            pubDate.text = value

        pubPlace = ET.SubElement(imprint, "pubPlace")
        ref = ET.SubElement(pubPlace, "ref")
        ref.set("type", "page_url")
        ref.set("target", pageURLs[i].text)

        text = ET.SubElement(tei, "text")
        body = ET.SubElement(text, "body")
        body.extend(list(texts[i]))

    return tree
lora-sp132c3e52023-03-09 16:32:37 +0100125
lora-sp132c3e52023-03-09 16:32:37 +0100126
# Run the conversion only when executed as a script, so that importing this
# module (e.g. to reuse convert) does not read sys.argv or write any files.
if __name__ == "__main__":
    main()