blob: cd468a648238bd9b2b068d0686a7e2436a52f3e2 [file] [log] [blame]
Rameela Yaddehige270f7cc2023-07-14 14:12:08 +02001import os, sys
2import xml.etree.ElementTree as ET
3from xml.dom import minidom
4import traceback
5import sys
6
def main():
    """Build the GA corpus file from the TEI documents named on the command line.

    Writes an empty ``<teiCorpus>`` skeleton to ``input/tree_structure.xml``,
    parses it back, appends the converted root of every path given in
    ``sys.argv[1:]`` and saves the result to ``output/GA_corpus.p5.xml``.
    A document whose conversion fails is reported on stdout (path plus
    traceback) and skipped; the remaining documents are still processed.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    origRoot = ET.fromstring(corpus)
    corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs("input", exist_ok=True)
    os.makedirs("output", exist_ok=True)
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(corpusStr)

    # Parse corpus tree
    corpusTree = ET.parse("input/tree_structure.xml")
    corpusRoot = corpusTree.getroot()

    # Process documents and append to corpus tree; the zero-based position of
    # the document on the command line becomes part of its text sigle.
    for j, path in enumerate(sys.argv[1:]):
        try:
            currentTree = convert(j, path)
        except Exception:
            # Best effort: report the broken document and keep going.
            # ``Exception`` (not a bare except) lets Ctrl-C and SystemExit
            # still terminate the run.
            print("ERROR:" + path)
            print(traceback.format_exc())
            continue
        corpusRoot.append(currentTree.getroot())

    # Indent and save tree
    ET.indent(corpusTree, " ")
    corpusTree.write(
        "output/GA_corpus.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
Rameela Yaddehige270f7cc2023-07-14 14:12:08 +020038
39
def convert(j, file):
    """Convert one TEI document into a ``teiDoc`` tree with one ``TEI`` per text.

    The document is parsed, its bibliographic metadata and ``body/div``
    elements are collected, all children of the root are removed, the root is
    renamed to ``teiDoc`` and one ``TEI`` element (header plus ``text``) is
    created for every collected ``div``. The text sigle of each entry is
    printed to stdout, as are the publisher and every licence text.

    Args:
        j: Document number, formatted as three digits in the text sigle.
        file: Path (or file object) of the TEI document to parse.

    Returns:
        The parsed ``ElementTree`` whose root has been rewritten in place.

    Raises:
        AttributeError: If the document has at least one text but lacks a
            ``bibl/title``, ``bibl/date``, ``bibl/pubPlace`` or any element
            carrying a ``type`` attribute.
        Exception: Whatever ``ET.parse`` raises for an unreadable or
            malformed file is propagated unchanged.
    """
    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    tei_ns = "{http://www.tei-c.org/ns/1.0}"
    bibl_child = ".//*" + tei_ns + "bibl/" + tei_ns

    # Store metadata and texts; everything must be looked up before the
    # root is emptied below.
    titles = root.find(bibl_child + "title")
    uris = root.find(bibl_child + "pubPlace")
    authors = root.findall(bibl_child + "author")
    txtPublisher = root.find(bibl_child + "publisher")
    licenses = root.findall(".//*" + tei_ns + "availability/" + tei_ns + "licence")
    dates = root.find(bibl_child + "date")
    texts = root.findall(".//*" + tei_ns + "body/" + tei_ns + "div")
    # The domain is taken from the first element that has a ``type`` attribute.
    domains = root.find(".//*[@type]")

    # Remove all elements from root
    for elem in root.findall("*"):
        root.remove(elem)

    # Rename root
    root.tag = "teiDoc"

    # Create target structure
    for i, text_div in enumerate(texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")

        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = f"GA/{j:03}.{i:05}"
        title = ET.SubElement(titleStmt, "title")
        title.text = titles.text
        print(textSigle.text, end='\n')

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        # A document may name several authors; each gets its own element.
        for author in authors:
            hauthor = ET.SubElement(analytic, "h.author")
            hauthor.text = author.text

        imprint = ET.SubElement(sourceDesc, "imprint")

        # The publisher is optional: report its absence instead of
        # dereferencing None, and only emit the element when it exists.
        if txtPublisher is None:
            print('Publisher : ', None, end='\n')
        else:
            print('Publisher : ', txtPublisher.text, end='\n')
            publisher = ET.SubElement(imprint, "publisher")
            publisher.text = txtPublisher.text

        # Licences are only reported, not written to the output.
        for licence_elem in licenses:
            print('License : ', licence_elem.text, end='\n')

        pubDateYear = ET.SubElement(imprint, "pubDate")
        pubDateYear.set("type", "year")
        pubDateYear.text = dates.text
        # Month and day elements are emitted without text.
        pubDateMonth = ET.SubElement(imprint, "pubDate")
        pubDateMonth.set("type", "month")
        pubDateDay = ET.SubElement(imprint, "pubDate")
        pubDateDay.set("type", "day")

        pubPlace = ET.SubElement(imprint, "pubPlace")
        idno = ET.SubElement(pubPlace, "idno")
        idno.set("type", "URI")
        idno.text = uris.text

        domain = ET.SubElement(titleStmt, "domain")
        domain.text = domains.get("type").replace("_hobbies", "_Hobbies")

        text = ET.SubElement(tei, "text")
        text.append(text_div)

    return tree
137
138
# Run the conversion only when executed as a script, so that importing this
# module (e.g. to reuse convert()) does not start a corpus build.
if __name__ == "__main__":
    main()