blob: 2a28da53b91db340e0ab1d2746f136bc47de2bc1 [file] [log] [blame]
lora-sp2e2a21a2023-03-30 13:54:59 +02001import os, sys
2import xml.etree.ElementTree as ET
3from xml.dom import minidom
Marc Kupietz17269e42023-05-25 10:59:01 +02004import traceback
5import sys
lora-sp2e2a21a2023-03-30 13:54:59 +02006
def main():
    """Convert every file named on the command line into one TEI corpus.

    Writes an empty ``teiCorpus`` skeleton to ``input/tree_structure.xml``,
    parses it back, appends the converted root of each input document to it
    and saves the result as ``output/corpus.p5.xml``.  The ``input`` and
    ``output`` directories are created when missing.

    A document whose conversion fails is reported on standard output
    (``ERROR:<path>`` followed by the traceback) and skipped, so one broken
    file does not abort the whole run.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    skeleton_root = ET.fromstring(corpus)
    skeleton = minidom.parseString(ET.tostring(skeleton_root)).toprettyxml(indent=" ")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("input", exist_ok=True)
    os.makedirs("output", exist_ok=True)
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(skeleton)

    # Parse corpus tree
    corpus_tree = ET.parse("input/tree_structure.xml")
    corpus_root = corpus_tree.getroot()

    # Process documents and append to corpus tree
    for index, path in enumerate(sys.argv[1:]):
        try:
            document_root = convert(index, path).getroot()
        except Exception:
            # Best effort: report and move on to the next document.  Only
            # Exception is caught so that Ctrl-C and sys.exit() still stop
            # the run instead of being logged as a conversion error.
            print("ERROR:" + path)
            print(traceback.format_exc())
            continue
        corpus_root.append(document_root)

    # Indent and save tree
    ET.indent(corpus_tree, " ")
    corpus_tree.write(
        "output/corpus.p5.xml",
        encoding="utf-8",
        xml_declaration=True,
        method="xml",
        short_empty_elements=True,
    )
38
39
def _domain_from_path(file, fallback):
    """Derive the domain label from the directory names in *file*.

    ``Persuasive`` and ``Blog`` parent directories are used as-is.  Otherwise
    the label is built from the directories above the file: three levels
    joined by ``:``, or two levels when the third one up is
    ``Originaldaten``.  *fallback* is returned when the path has too few
    components to apply these rules.
    """
    parts = file.split("/")
    if len(parts) >= 2 and parts[-2] in ("Persuasive", "Blog"):
        return parts[-2]
    if len(parts) < 4:
        # Not enough directory levels to derive a label from the path.
        return fallback
    first = -4 if parts[-4] != "Originaldaten" else -3
    return ":".join(parts[first:-1]).replace("_hobbies", "_Hobbies")


def convert(j, file):
    """Convert one TEI file into a ``teiDoc`` tree with one ``TEI`` per text.

    The first ``title``, ``pubPlace``, ``author`` and ``date`` found below a
    TEI ``bibl`` element are copied into the header of every generated
    ``TEI`` entry.  Each TEI ``div`` that is a direct child of a ``body``
    element becomes the content of one entry's ``text`` element.

    Args:
        j: Index of the document; it forms the three-digit part of every
            text sigle (``NOR/<j>.<i>``).
        file: Path of the TEI XML file as a ``/``-separated string.  Its
            directory names are also used to derive the domain label.

    Returns:
        The parsed ``ElementTree``; its root is renamed to ``teiDoc`` and its
        former children are replaced by the generated ``TEI`` entries.

    Raises:
        OSError: If *file* cannot be opened.
        xml.etree.ElementTree.ParseError: If *file* is not well-formed XML.
        AttributeError: If the file contains at least one text but lacks one
            of the ``bibl`` metadata elements listed above.
    """
    tei_ns = "{http://www.tei-c.org/ns/1.0}"

    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    # Locate metadata and texts before the root is emptied below
    bibl_child = ".//*" + tei_ns + "bibl/" + tei_ns
    titles = root.find(bibl_child + "title")
    uris = root.find(bibl_child + "pubPlace")
    authors = root.find(bibl_child + "author")
    dates = root.find(bibl_child + "date")
    texts = root.findall(".//*" + tei_ns + "body/" + tei_ns + "div")
    typed = root.find(".//*[@type]")

    # The domain is the same for every text of the file, so work it out once.
    # The first @type value in the document is only a fallback for paths that
    # are too short; a document without any @type must not fail because of it.
    type_value = typed.get("type") if typed is not None else None
    domain_label = _domain_from_path(file, type_value)

    # Remove all elements from root
    for elem in root.findall("*"):
        root.remove(elem)

    # Rename root
    root.tag = "teiDoc"

    # Create target structure
    for i, body_div in enumerate(texts):
        tei = ET.SubElement(root, "TEI")
        tei_header = ET.SubElement(tei, "teiHeader")
        file_desc = ET.SubElement(tei_header, "fileDesc")

        title_stmt = ET.SubElement(file_desc, "titleStmt")
        ET.SubElement(title_stmt, "textSigle").text = f"NOR/{j:03}.{i:05}"
        ET.SubElement(title_stmt, "title").text = titles.text

        source_desc = ET.SubElement(file_desc, "sourceDesc")
        analytic = ET.SubElement(source_desc, "analytic")
        ET.SubElement(analytic, "h.author").text = authors.text

        imprint = ET.SubElement(source_desc, "imprint")
        ET.SubElement(imprint, "pubDate", {"type": "year"}).text = dates.text
        # Month and day are emitted as empty placeholders.
        ET.SubElement(imprint, "pubDate", {"type": "month"})
        ET.SubElement(imprint, "pubDate", {"type": "day"})
        pub_place = ET.SubElement(imprint, "pubPlace")
        ET.SubElement(pub_place, "idno", {"type": "URI"}).text = uris.text

        ET.SubElement(title_stmt, "domain").text = domain_label

        ET.SubElement(tei, "text").append(body_div)

    return tree
111
112
# Run the conversion only when executed as a script, so that importing this
# module (e.g. to reuse convert()) does not start processing sys.argv.
if __name__ == "__main__":
    main()