# blob: be20615f9ba5888503ad3fee04df7cf5859b8856 [file] [log] [blame]
import os
import sys
import xml.etree.ElementTree as ET
from xml.dom import minidom
4
5
def main():
    """Build ``output/corpus.p5.xml`` from the TEI files named on the command line.

    An empty ``<teiCorpus>`` skeleton is written to
    ``input/tree_structure.xml`` and parsed back.  Every path in
    ``sys.argv[1:]`` is then passed to ``convert`` together with its
    zero-based position, and the root of the returned tree is appended to the
    corpus root.  A file whose conversion fails is skipped: its path is
    printed to stdout and the reason is reported on stderr.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    origRoot = ET.fromstring(corpus)
    corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("input", exist_ok=True)
    os.makedirs("output", exist_ok=True)
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(corpusStr)

    # Parse corpus tree
    corpusTree = ET.parse("input/tree_structure.xml")
    corpusRoot = corpusTree.getroot()

    # Process documents and append to corpus tree
    for index, path in enumerate(sys.argv[1:]):
        try:
            currentRoot = convert(index, path).getroot()
        except Exception as err:
            # Best effort: one bad document must not abort the whole run.
            # Catching Exception (not a bare except) lets Ctrl-C and
            # SystemExit still terminate the script.
            print(path)
            print(f"skipped {path}: {err!r}", file=sys.stderr)
            continue
        corpusRoot.append(currentRoot)

    # Indent and save tree
    ET.indent(corpusTree, " ")
    corpusTree.write(
        "output/corpus.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
35
36
def convert(j, file):
    """Rewrite one TEI document as a ``teiDoc`` tree with one ``TEI`` entry per text.

    The document is parsed, its ``bibl`` metadata (title, author, date,
    pubPlace) and the ``type`` attribute of the first descendant carrying one
    are read, and every ``body/div`` element is collected.  The root is then
    emptied, renamed to ``teiDoc`` and refilled with one ``TEI`` element per
    collected ``div``; each gets a ``teiHeader`` built from the shared
    metadata and a ``text`` element wrapping the ``div``.

    Args:
        j: Integer document index; formatted as three digits in ``textSigle``.
        file: Path or file object accepted by ``ET.parse``.

    Returns:
        The parsed ``ElementTree`` whose root has been rewritten in place.

    Raises:
        ValueError: If the document contains texts but a required metadata
            element is missing.
    """
    ns = "{http://www.tei-c.org/ns/1.0}"

    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    # Look up metadata and texts before the tree is emptied
    bibl = f".//*{ns}bibl/{ns}"
    found = {
        "title": root.find(bibl + "title"),
        "pubPlace": root.find(bibl + "pubPlace"),
        "author": root.find(bibl + "author"),
        "date": root.find(bibl + "date"),
        "@type": root.find(".//*[@type]"),
    }
    texts = root.findall(f".//*{ns}body/{ns}div")

    # Metadata is only needed when at least one text entry will be written.
    if texts:
        missing = [name for name, elem in found.items() if elem is None]
        if missing:
            raise ValueError("missing required metadata: " + ", ".join(missing))
        title_text = found["title"].text
        uri_text = found["pubPlace"].text
        author_text = found["author"].text
        date_text = found["date"].text
        domain_text = found["@type"].get("type")

    # Remove all elements from root (attributes and text are kept)
    for child in list(root):
        root.remove(child)

    # Rename root
    root.tag = "teiDoc"

    # Create target structure.  Iterating the collected divs directly keeps
    # the number of entries in step with the list being read; counting every
    # grandchild of <text> separately overran it when <text> had children
    # other than <body>.
    for i, div in enumerate(texts):
        tei = ET.SubElement(root, "TEI")
        fileDesc = ET.SubElement(ET.SubElement(tei, "teiHeader"), "fileDesc")

        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        ET.SubElement(titleStmt, "textSigle").text = f"EN/{j:03}.{i:05}"
        ET.SubElement(titleStmt, "title").text = title_text
        ET.SubElement(titleStmt, "domain").text = domain_text

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        ET.SubElement(analytic, "h.author").text = author_text

        imprint = ET.SubElement(sourceDesc, "imprint")
        ET.SubElement(imprint, "pubDate", type="year").text = date_text
        # Month and day are emitted as empty placeholders.
        ET.SubElement(imprint, "pubDate", type="month")
        ET.SubElement(imprint, "pubDate", type="day")
        pubPlace = ET.SubElement(imprint, "pubPlace")
        ET.SubElement(pubPlace, "idno", type="URI").text = uri_text

        ET.SubElement(tei, "text").append(div)

    return tree
99
100
# Run the conversion only when executed as a script, so that importing this
# module does not process sys.argv and write files as a side effect.
if __name__ == "__main__":
    main()