blob: 5c67bdc991c2ad483d85b123936ef3c50c227eba [file] [log] [blame]
lora-sp2e2a21a2023-03-30 13:54:59 +02001import os, sys
2import xml.etree.ElementTree as ET
3from xml.dom import minidom
Marc Kupietz17269e42023-05-25 10:59:01 +02004import traceback
5import sys
lora-sp2e2a21a2023-03-30 13:54:59 +02006
def main():
    """Build a TEI corpus file from the documents named on the command line.

    Writes an empty ``<teiCorpus>`` skeleton to ``input/tree_structure.xml``,
    parses it back, appends the converted tree of every path given in
    ``sys.argv[1:]`` and saves the result as ``output/corpus.p5.xml``.

    A document that cannot be converted is reported on stdout (an
    ``ERROR:<path>`` line followed by the formatted traceback) and skipped,
    so one bad input does not abort the whole run.
    """
    # Create corpus structure from string and save into file
    skeleton = ET.fromstring("<teiCorpus>\n</teiCorpus>")
    pretty = minidom.parseString(ET.tostring(skeleton)).toprettyxml(indent=" ")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("input", exist_ok=True)
    os.makedirs("output", exist_ok=True)
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(pretty)

    # Parse corpus tree
    corpusTree = ET.parse("input/tree_structure.xml")
    corpusRoot = corpusTree.getroot()

    # Process documents and append to corpus tree; the zero-based position
    # of each path becomes the document number passed to convert().
    for index, path in enumerate(sys.argv[1:]):
        try:
            corpusRoot.append(convert(index, path).getroot())
        except Exception:
            # Catch Exception rather than everything, so that Ctrl-C and
            # sys.exit() still terminate the run instead of skipping a file.
            print("ERROR:" + path)
            print(traceback.format_exc())

    # Indent and save tree
    ET.indent(corpusTree, " ")
    corpusTree.write(
        "output/corpus.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
38
39
def _domain_from_path(file):
    """Derive the domain label from the directory names in the path *file*."""
    parts = file.split("/")
    if parts[-2] in ('Persuasive', 'Blog'):
        label = parts[-2]
    elif parts[-4] != "Originaldaten":
        label = ':'.join(parts[-4:-1])
    else:
        label = ':'.join(parts[-3:-1])
    return label.replace("_hobbies", "_Hobbies")


def convert(j, file):
    """Convert one TEI document into a ``teiDoc`` with one ``TEI`` per text.

    Every ``div`` that is a direct child of a ``body`` element becomes its
    own ``TEI`` element with a generated header (sigle, title, author,
    publication year, source URI, domain) followed by a ``text`` element
    that wraps the original ``div``.

    Args:
        j: Document number; formatted as the three-digit part of each
            ``textSigle`` (``NOR/<j>.<text index>``).
        file: "/"-separated path of the TEI file to read.  The directory
            names in front of the file name determine the ``domain`` label.

    Returns:
        The parsed ``ElementTree``.  Its root is renamed to ``teiDoc`` and
        its former children are replaced by the generated ``TEI`` elements.

    Raises:
        AttributeError: If the document contains texts but one of the
            ``bibl`` children ``title``, ``author``, ``date`` or
            ``pubPlace`` is missing.
        IndexError: If the document contains texts but *file* has too few
            path components to derive a domain from.
    """
    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    # Locate metadata and texts before the tree is restructured
    ns = "{http://www.tei-c.org/ns/1.0}"
    titles = root.find(f".//*{ns}bibl/{ns}title")
    uris = root.find(f".//*{ns}bibl/{ns}pubPlace")
    authors = root.find(f".//*{ns}bibl/{ns}author")
    dates = root.find(f".//*{ns}bibl/{ns}date")
    texts = root.findall(f".//*{ns}body/{ns}div")

    # Remove all elements from root and rename it
    del root[:]
    root.tag = "teiDoc"

    if not texts:
        return tree

    # These values are the same for every text of the document.
    title_text = titles.text
    author_text = authors.text
    year_text = dates.text
    uri_text = uris.text
    domain_text = _domain_from_path(file)

    # Create target structure.  Iterating over the divs themselves (instead
    # of a separately computed count) guarantees exactly one TEI per div,
    # even when <body> has other children such as <head>.
    for i, div in enumerate(texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")

        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        ET.SubElement(titleStmt, "textSigle").text = f"NOR/{j:03}.{i:05}"
        ET.SubElement(titleStmt, "title").text = title_text
        ET.SubElement(titleStmt, "domain").text = domain_text

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        ET.SubElement(analytic, "h.author").text = author_text

        imprint = ET.SubElement(sourceDesc, "imprint")
        pubDateYear = ET.SubElement(imprint, "pubDate")
        pubDateYear.set("type", "year")
        pubDateYear.text = year_text
        # Month and day are emitted as empty placeholders; no source value
        # is read for them.
        ET.SubElement(imprint, "pubDate").set("type", "month")
        ET.SubElement(imprint, "pubDate").set("type", "day")
        pubPlace = ET.SubElement(imprint, "pubPlace")
        idno = ET.SubElement(pubPlace, "idno")
        idno.set("type", "URI")
        idno.text = uri_text

        ET.SubElement(tei, "text").append(div)

    return tree
110
111
# Run the conversion only when executed as a script, so that importing this
# module does not create directories or write output files.
if __name__ == "__main__":
    main()