blob: 6efd6df4b3f8741edc12163f9ab9c8c1bc985792 [file] [log] [blame]
Rameela Yaddehige270f7cc2023-07-14 14:12:08 +02001import os, sys
2import xml.etree.ElementTree as ET
3from xml.dom import minidom
4import traceback
5import sys
6
def main():
    """Build a TEI corpus file from the documents named on the command line.

    Writes an empty ``<teiCorpus>`` skeleton to ``input/tree_structure.xml``,
    parses it back, appends the converted tree of every path in
    ``sys.argv[1:]`` and saves the result to ``output/corpus.p5.xml``.
    A document that fails to convert is reported on stdout (path plus
    traceback) and skipped; the remaining documents are still processed.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    skeleton_root = ET.fromstring(corpus)
    skeleton_xml = minidom.parseString(ET.tostring(skeleton_root)).toprettyxml(indent=" ")
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs("input", exist_ok=True)
    os.makedirs("output", exist_ok=True)
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(skeleton_xml)

    # Parse corpus tree
    corpusTree = ET.parse("input/tree_structure.xml")
    corpusRoot = corpusTree.getroot()

    # Process documents and append to corpus tree; the zero-based position
    # of each path becomes the document number passed to convert().
    for index, path in enumerate(sys.argv[1:]):
        try:
            currentTree = convert(index, path)
            corpusRoot.append(currentTree.getroot())
        except Exception:
            # Best effort: report the failing file and carry on.  Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit abort
            # the whole run instead of being swallowed per file.
            print("ERROR:" + path)
            print(traceback.format_exc())
            continue

    # Indent and save tree
    ET.indent(corpusTree, " ")
    corpusTree.write(
        "output/corpus.p5.xml",
        encoding="utf-8",
        xml_declaration=True,
        method="xml",
        short_empty_elements=True,
    )
38
39
_TEI_NS = "http://www.tei-c.org/ns/1.0"


def _domain_from_path(file):
    """Derive the domain label from the directory components of *file*."""
    parts = file.split("/")
    if parts[-2] in ("Persuasive", "Blog"):
        label = parts[-2]
    elif parts[-4] != "Originaldaten":
        # Three directory levels above the file.
        label = ":".join(parts[-4:-1])
    else:
        # "Originaldaten" itself is not part of the label.
        label = ":".join(parts[-3:-1])
    # NOTE(review): the capitalisation fix is applied on every branch here;
    # confirm this matches the intended scope of the original statement.
    return label.replace("_hobbies", "_Hobbies")


def convert(j, file):
    """Convert one TEI document into a ``teiDoc`` holding one ``TEI`` per text.

    The document at *file* is parsed, the first ``bibl`` title, pubPlace,
    author and date are read, and every ``body/div`` is collected.  The root
    is then emptied, renamed to ``teiDoc`` and refilled with one ``TEI``
    element per collected ``div``; each carries a header built from the
    shared metadata and the ``div`` itself inside a ``text`` element.

    Args:
        j: Document number; formatted with three digits into ``textSigle``.
        file: "/"-separated path of the TEI file.  Its directory components
            determine the ``domain`` value.

    Returns:
        The parsed ``ElementTree``, modified in place.

    Raises:
        AttributeError: A required ``bibl`` child is missing and the
            document contains at least one text.
        IndexError: *file* has too few directory components to derive a
            domain and the document contains at least one text.
    """
    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    # Module-wide setting: TEI elements are serialised without a prefix.
    ET.register_namespace("", _TEI_NS)

    # Look up metadata and texts before the root is emptied
    bibl_child = ".//*{%s}bibl/{%s}" % (_TEI_NS, _TEI_NS)
    titles = root.find(bibl_child + "title")
    uris = root.find(bibl_child + "pubPlace")
    authors = root.find(bibl_child + "author")
    dates = root.find(bibl_child + "date")
    texts = root.findall(".//*{%s}body/{%s}div" % (_TEI_NS, _TEI_NS))

    # Remove all elements from root
    for elem in root.findall("*"):
        root.remove(elem)

    # Rename root
    root.tag = "teiDoc"

    # Without texts there is nothing to build; returning here also keeps
    # missing metadata or a short path from raising for an empty document.
    if not texts:
        return tree

    # Values shared by every text of this document
    title_text = titles.text
    author_text = authors.text
    date_text = dates.text
    uri_text = uris.text
    domain_text = _domain_from_path(file)

    # Create target structure
    for i, div in enumerate(texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")

        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = f"GA/{j:03}.{i:05}"
        title = ET.SubElement(titleStmt, "title")
        title.text = title_text
        domain = ET.SubElement(titleStmt, "domain")
        domain.text = domain_text

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        hauthor = ET.SubElement(analytic, "h.author")
        hauthor.text = author_text

        imprint = ET.SubElement(sourceDesc, "imprint")
        pubDateYear = ET.SubElement(imprint, "pubDate")
        pubDateYear.set("type", "year")
        pubDateYear.text = date_text
        # Month and day elements are emitted but left empty.
        pubDateMonth = ET.SubElement(imprint, "pubDate")
        pubDateMonth.set("type", "month")
        pubDateDay = ET.SubElement(imprint, "pubDate")
        pubDateDay.set("type", "day")
        pubPlace = ET.SubElement(imprint, "pubPlace")
        idno = ET.SubElement(pubPlace, "idno")
        idno.set("type", "URI")
        idno.text = uri_text

        text = ET.SubElement(tei, "text")
        text.append(div)

    return tree
111
112
# Run the conversion only when executed as a script, so that importing this
# module (e.g. to reuse convert()) does not start processing sys.argv.
if __name__ == "__main__":
    main()