blob: 693ed4b36b073b4a395d5026a64ac1569ae1692e [file] [log] [blame]
lora-sp2e2a21a2023-03-30 13:54:59 +02001import os, sys
2import xml.etree.ElementTree as ET
3from xml.dom import minidom
4
5
def main():
    """Build ``output/corpus.p5.xml`` from the TEI files named in ``sys.argv``.

    An empty ``<teiCorpus>`` skeleton is written to
    ``input/tree_structure.xml`` and parsed back; every command-line argument
    is then passed to ``convert`` and the resulting document root is appended
    to the corpus, in argument order.  A file that cannot be converted is
    skipped: its path is printed on stdout (as before) and the reason is
    reported on stderr.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    skeleton_root = ET.fromstring(corpus)
    corpus_str = minidom.parseString(ET.tostring(skeleton_root)).toprettyxml(indent=" ")
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs("input", exist_ok=True)
    os.makedirs("output", exist_ok=True)
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(corpus_str)

    # Parse corpus tree
    corpus_tree = ET.parse("input/tree_structure.xml")
    corpus_root = corpus_tree.getroot()

    # Process documents and append to corpus tree.  The zero-based position
    # of the argument is what convert() receives as the document number.
    for index, path in enumerate(sys.argv[1:]):
        try:
            current_root = convert(index, path).getroot()
        except Exception as exc:
            # Best effort: one bad input must not abort the whole run.
            # Ctrl-C / SystemExit are deliberately no longer swallowed here.
            print(path)
            print(f"  skipped ({type(exc).__name__}: {exc})", file=sys.stderr)
            continue
        corpus_root.append(current_root)

    # Indent and save tree
    ET.indent(corpus_tree, " ")
    corpus_tree.write(
        "output/corpus.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
35
36
def _domain_from_path(file):
    """Return the domain label derived from the directory names in *file*."""
    parts = file.split("/")
    if parts[-2] in ('Persuasive', 'Blog'):
        label = parts[-2]
    elif parts[-4] != "Originaldaten":
        label = parts[-4] + ':' + parts[-3] + ':' + parts[-2]
    else:
        label = parts[-3] + ':' + parts[-2]
    return label.replace("_hobbies", "_Hobbies")


def convert(j, file):
    """Rewrite one TEI file into a ``teiDoc`` holding one ``TEI`` per text.

    The file is parsed, the first ``bibl`` title, pubPlace, author and date
    are read, and every ``body/div`` element is collected.  The root is then
    emptied, renamed to ``teiDoc`` and refilled with one ``TEI`` element per
    collected ``div``, each with a generated header and the ``div`` itself
    under ``text``.

    Args:
        j: Document number; formatted as three digits in the text sigle
            (``NOR/<j>.<text index>``).
        file: Path of the input file, split on ``/``.  The directory names
            directly above the file determine the ``domain`` value.

    Returns:
        The parsed ``ElementTree`` whose root has been rewritten in place.

    Raises:
        AttributeError: If the file has at least one text but lacks a
            ``bibl`` title, author, date or pubPlace.
        IndexError: If the file has at least one text and *file* has too few
            ``/``-separated components to derive the domain.
        Whatever ``ET.parse`` raises for unreadable or malformed input.
    """
    ns = "{http://www.tei-c.org/ns/1.0}"

    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    # Collect metadata (first match each) and all text divisions
    titles = root.find(".//*" + ns + "bibl/" + ns + "title")
    uris = root.find(".//*" + ns + "bibl/" + ns + "pubPlace")
    authors = root.find(".//*" + ns + "bibl/" + ns + "author")
    dates = root.find(".//*" + ns + "bibl/" + ns + "date")
    texts = root.findall(".//*" + ns + "body/" + ns + "div")

    # Remove all elements from root; it is reused as the output container
    for elem in root.findall("*"):
        root.remove(elem)

    # Rename root
    root.tag = "teiDoc"

    if not texts:
        return tree

    # Depends only on the path, so compute it once for all texts.
    domain_text = _domain_from_path(file)

    # Create target structure.  Iterating over the collected divs themselves
    # (instead of a separately derived count) keeps the number of TEI
    # elements and the available texts in step.
    for i, div in enumerate(texts):
        tei = ET.SubElement(root, "TEI")
        tei_header = ET.SubElement(tei, "teiHeader")
        file_desc = ET.SubElement(tei_header, "fileDesc")

        title_stmt = ET.SubElement(file_desc, "titleStmt")
        ET.SubElement(title_stmt, "textSigle").text = f"NOR/{j:03}.{i:05}"
        ET.SubElement(title_stmt, "title").text = titles.text
        ET.SubElement(title_stmt, "domain").text = domain_text

        source_desc = ET.SubElement(file_desc, "sourceDesc")
        analytic = ET.SubElement(source_desc, "analytic")
        ET.SubElement(analytic, "h.author").text = authors.text

        imprint = ET.SubElement(source_desc, "imprint")
        ET.SubElement(imprint, "pubDate", type="year").text = dates.text
        # Month and day are emitted as empty placeholders.
        ET.SubElement(imprint, "pubDate", type="month")
        ET.SubElement(imprint, "pubDate", type="day")
        pub_place = ET.SubElement(imprint, "pubPlace")
        ET.SubElement(pub_place, "idno", type="URI").text = uris.text

        ET.SubElement(tei, "text").append(div)

    return tree
107
108
# Run the conversion only when executed as a script, so that importing this
# module (e.g. to reuse convert) does not write files as a side effect.
if __name__ == "__main__":
    main()