blob: 9cffd67457be67bc29b4a61e7699bf7ebb3819a2 [file] [log] [blame]
Rameela Yaddehige270f7cc2023-07-14 14:12:08 +02001import os, sys
2import xml.etree.ElementTree as ET
3from xml.dom import minidom
4import traceback
5import sys
6
def main():
    """Build a TEI corpus file from the documents named on the command line.

    An empty ``<teiCorpus>`` skeleton is written to
    ``input/tree_structure.xml`` and parsed back, every path in
    ``sys.argv[1:]`` is passed to ``convert`` and the root of the returned
    tree is appended to the corpus root, and the indented result is saved
    as ``output/GA_corpus.p5.xml``.

    A document whose conversion raises is reported on stdout together with
    its traceback and skipped; the remaining documents are still processed.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    origRoot = ET.fromstring(corpus)
    corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs("input", exist_ok=True)
    os.makedirs("output", exist_ok=True)
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(corpusStr)

    # Parse corpus tree
    corpusTree = ET.parse("input/tree_structure.xml")
    corpusRoot = corpusTree.getroot()

    # Process documents and append to corpus tree; the zero-based position
    # of the path on the command line becomes the document index.
    for index, path in enumerate(sys.argv[1:]):
        try:
            currentTree = convert(index, path)
        except Exception:
            # Best effort: report the failing document and keep going.
            # Exception (not a bare except) so that Ctrl-C and SystemExit
            # still stop the run instead of being logged as a bad document.
            print("ERROR:" + path)
            print(traceback.format_exc())
            continue
        corpusRoot.append(currentTree.getroot())

    # Indent and save tree
    ET.indent(corpusTree, " ")
    corpusTree.write(
        "output/GA_corpus.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
Rameela Yaddehige270f7cc2023-07-14 14:12:08 +020038
39
def convert(j, file):
    """Convert one TEI document into a ``teiDoc`` tree with one ``TEI`` per text.

    The document's bibliographic metadata (``bibl/title``, ``bibl/author``,
    ``bibl/date``, ``bibl/pubPlace``) and the ``type`` attribute of the first
    descendant that carries one are read once, the root is emptied and
    renamed to ``teiDoc``, and for every ``body/div`` of the input a ``TEI``
    element is created that holds a freshly built header plus that ``div``
    wrapped in a ``text`` element.

    Args:
        j: Integer index of the document; formatted as the three-digit
            document part of each text sigle (``GA/JJJ.IIIII``).
        file: Path or file object accepted by ``ET.parse``.

    Returns:
        The parsed ``ElementTree``, modified in place as described above.
        A document without any ``body/div`` yields an empty ``teiDoc`` root.

    Raises:
        AttributeError: If the document contains at least one text but lacks
            a title, date, pubPlace, or any element with a ``type`` attribute.
    """
    tei_ns = "{http://www.tei-c.org/ns/1.0}"
    bibl_child = f".//*{tei_ns}bibl/{tei_ns}"

    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    # Collect metadata and texts before the root is emptied
    titles = root.find(bibl_child + "title")
    uris = root.find(bibl_child + "pubPlace")
    authors = root.findall(bibl_child + "author")
    dates = root.find(bibl_child + "date")
    texts = root.findall(f".//*{tei_ns}body/{tei_ns}div")
    domains = root.find(".//*[@type]")

    # Remove all elements from root
    for elem in root.findall("*"):
        root.remove(elem)

    # Rename root
    root.tag = "teiDoc"

    # Nothing to emit; metadata is only required when there are texts.
    if not texts:
        return tree

    # The metadata is identical for every text of the document, so read it
    # once instead of once per text.
    title_text = titles.text
    year_text = dates.text
    uri_text = uris.text
    domain_text = domains.get("type").replace("_hobbies", "_Hobbies")
    author_names = [author.text for author in authors]

    # Create target structure
    for i, div in enumerate(texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")

        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        ET.SubElement(titleStmt, "textSigle").text = f"GA/{j:03}.{i:05}"
        ET.SubElement(titleStmt, "title").text = title_text
        ET.SubElement(titleStmt, "domain").text = domain_text

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        # One h.author element per author of the source document
        for name in author_names:
            ET.SubElement(analytic, "h.author").text = name

        imprint = ET.SubElement(sourceDesc, "imprint")
        ET.SubElement(imprint, "pubDate", {"type": "year"}).text = year_text
        # Month and day elements are emitted without content.
        ET.SubElement(imprint, "pubDate", {"type": "month"})
        ET.SubElement(imprint, "pubDate", {"type": "day"})
        pubPlace = ET.SubElement(imprint, "pubPlace")
        ET.SubElement(pubPlace, "idno", {"type": "URI"}).text = uri_text

        ET.SubElement(tei, "text").append(div)

    return tree
121
122
# Run the conversion only when executed as a script, so that importing this
# module does not create directories or write corpus files as a side effect.
if __name__ == "__main__":
    main()