Rameela Yaddehige | 270f7cc | 2023-07-14 14:12:08 +0200 | [diff] [blame] | 1 | import os, sys |
| 2 | import xml.etree.ElementTree as ET |
| 3 | from xml.dom import minidom |
| 4 | import traceback |
| 5 | import sys |
| 6 | |
def main():
    """Build a ``teiCorpus`` from the TEI files named on the command line.

    Steps:

    1. Write an empty ``<teiCorpus>`` skeleton to ``input/tree_structure.xml``
       (creating the ``input`` and ``output`` directories when missing) and
       parse it back as the corpus tree.
    2. Convert every path in ``sys.argv[1:]`` with :func:`convert`, passing
       the zero-based position of the path as the document number, and append
       the resulting root element to the corpus root.
    3. Indent the corpus tree and write it to ``output/GA_corpus.p5.xml``.

    A file whose conversion raises is reported on stdout (``ERROR:<path>``
    followed by the traceback) and skipped; the remaining files are still
    processed.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    origRoot = ET.fromstring(corpus)
    corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("input", exist_ok=True)
    os.makedirs("output", exist_ok=True)
    # Explicit encoding so the skeleton does not depend on the locale default.
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(corpusStr)

    # Parse corpus tree
    corpusTree = ET.parse("input/tree_structure.xml")
    corpusRoot = corpusTree.getroot()

    # Process documents and append to corpus tree
    for j, path in enumerate(sys.argv[1:]):
        try:
            currentRoot = convert(j, path).getroot()
        except Exception:
            # Best effort: report the failing file and carry on with the rest.
            # Catching Exception (not a bare except) keeps Ctrl-C and
            # sys.exit() able to stop the run.
            print("ERROR:" + path)
            print(traceback.format_exc())
            continue
        corpusRoot.append(currentRoot)

    # Indent and save tree
    ET.indent(corpusTree, " ")
    corpusTree.write(
        "output/GA_corpus.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
Rameela Yaddehige | 270f7cc | 2023-07-14 14:12:08 +0200 | [diff] [blame] | 38 | |
| 39 | |
def _element_text(element):
    """Return ``element.text``, or ``None`` when ``element`` is ``None``."""
    return None if element is None else element.text


def convert(j, file):
    """Convert one TEI document into a ``teiDoc`` tree with one ``TEI`` per div.

    The document is parsed, its bibliographic metadata and the ``div``
    children of ``body`` are collected, all children of the root are removed,
    the root is renamed to ``teiDoc`` and, for every collected ``div``, a
    ``TEI`` element (header plus ``text`` wrapping that ``div``) is appended.

    Args:
        j: Document number; formatted as three digits in the text sigle
            (``GA/<j:03>.<i:05>``, where ``i`` is the div index).
        file: Path or file object accepted by ``ET.parse``.

    Returns:
        The parsed ``ElementTree`` whose root has been rewritten in place.

    Metadata that is absent from the document (title, publisher, date,
    pubPlace, or an element carrying ``@type``) does not abort the
    conversion: the corresponding target element is left without text, and
    no ``publisher`` element is created when the source has none.

    Side effects: registers the TEI namespace as the default serialisation
    namespace and prints the sigle, publisher and licences of every generated
    text to stdout.
    """
    ns = "{http://www.tei-c.org/ns/1.0}"

    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    # Collect metadata and texts before the root is emptied below.
    title_text = _element_text(root.find(f".//*{ns}bibl/{ns}title"))
    uri_text = _element_text(root.find(f".//*{ns}bibl/{ns}pubPlace"))
    authors = root.findall(f".//*{ns}bibl/{ns}author")
    txtPublisher = root.find(f".//*{ns}bibl/{ns}publisher")
    licenses = root.findall(f".//*{ns}availability/{ns}licence")
    date_text = _element_text(root.find(f".//*{ns}bibl/{ns}date"))
    texts = root.findall(f".//*{ns}body/{ns}div")

    # The domain is the @type of the first descendant that carries one.
    typed = root.find(".//*[@type]")
    domain_text = None if typed is None else typed.get("type")
    if domain_text is not None:
        domain_text = domain_text.replace("_hobbies", "_Hobbies")

    # Remove all elements from root
    for elem in root.findall("*"):
        root.remove(elem)

    # Rename root
    root.tag = "teiDoc"

    # Create target structure: one TEI element per collected div.
    for i, div in enumerate(texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")
        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = "GA/" + f"{j:03}" + "." + f"{i:05}"
        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        title = ET.SubElement(titleStmt, "title")
        title.text = title_text

        print(textSigle.text, end='\n')

        # One h.author per source author, so multiple authors are kept.
        for author in authors:
            hauthor = ET.SubElement(analytic, "h.author")
            hauthor.text = author.text
        imprint = ET.SubElement(sourceDesc, "imprint")

        # Publisher is optional: only dereference it after the None check.
        print('Publisher : ', _element_text(txtPublisher), end='\n')
        if txtPublisher is not None:
            publisher = ET.SubElement(imprint, "publisher")
            publisher.text = txtPublisher.text

        # Licences are only reported, not written to the target tree.
        for licence in licenses:
            print('License : ', licence.text, end='\n')

        pubDateYear = ET.SubElement(imprint, "pubDate")
        pubDateYear.set("type", "year")
        pubDateYear.text = date_text
        # Month and day elements are created but left without text.
        pubDateMonth = ET.SubElement(imprint, "pubDate")
        pubDateMonth.set("type", "month")
        pubDateDay = ET.SubElement(imprint, "pubDate")
        pubDateDay.set("type", "day")
        pubPlace = ET.SubElement(imprint, "pubPlace")
        idno = ET.SubElement(pubPlace, "idno")
        idno.set("type", "URI")
        idno.text = uri_text
        domain = ET.SubElement(titleStmt, "domain")
        domain.text = domain_text
        text = ET.SubElement(tei, "text")
        text.append(div)

    return tree
| 137 | |
| 138 | |
# Script entry point: convert the files given on the command line.
# NOTE(review): this call is unconditional, so it also runs when the module
# is imported; confirm nothing imports this file before adding a
# `if __name__ == "__main__":` guard.
main()