Rameela Yaddehige | 270f7cc | 2023-07-14 14:12:08 +0200 | [diff] [blame] | 1 | import os, sys |
| 2 | import xml.etree.ElementTree as ET |
| 3 | from xml.dom import minidom |
| 4 | import traceback |
| 5 | import sys |
| 6 | |
def main():
    """Build the GA corpus file from the TEI documents named on the command line.

    Writes an empty ``teiCorpus`` skeleton to ``input/tree_structure.xml``,
    parses it back, appends the converted root of every file listed in
    ``sys.argv[1:]`` and saves the result to ``output/GA_corpus.p5.xml``.

    A document whose conversion raises is reported on stdout (file name and
    traceback) and skipped; the remaining documents are still processed.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    skeleton_root = ET.fromstring(corpus)
    corpus_str = minidom.parseString(ET.tostring(skeleton_root)).toprettyxml(indent=" ")

    # exist_ok avoids the separate existence check before creating the directories.
    os.makedirs("input", exist_ok=True)
    os.makedirs("output", exist_ok=True)
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(corpus_str)

    # Parse corpus tree
    corpus_tree = ET.parse("input/tree_structure.xml")
    corpus_root = corpus_tree.getroot()

    # Process documents and append to corpus tree
    for index, path in enumerate(sys.argv[1:]):
        try:
            document_root = convert(index, path).getroot()
        except Exception:
            # Catch Exception rather than everything so that Ctrl-C and
            # SystemExit still abort the run instead of being reported as a
            # broken input file.
            print("ERROR:" + path)
            print(traceback.format_exc())
            continue
        corpus_root.append(document_root)

    # Indent and save tree
    ET.indent(corpus_tree, " ")
    corpus_tree.write(
        "output/GA_corpus.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
Rameela Yaddehige | 270f7cc | 2023-07-14 14:12:08 +0200 | [diff] [blame] | 38 | |
| 39 | |
def convert(j, file):
    """Rewrite one TEI document as a ``teiDoc`` holding one ``TEI`` entry per text.

    The file is parsed, its bibliographic metadata (title, authors, date,
    pubPlace) and the ``type`` attribute of the first descendant carrying one
    are read, and every ``body/div`` element is collected. The root is then
    emptied, renamed to ``teiDoc`` and refilled with one ``TEI`` element per
    collected ``div``; each entry repeats the shared metadata in its header
    and wraps its ``div`` in a ``text`` element.

    Args:
        j: Document index; it forms the three-digit document part of each
            text sigle (``GA/<j>.<text index>``).
        file: Path (or file object) handed to ``ET.parse``.

    Returns:
        The parsed ``ElementTree`` whose root was rewritten in place. If the
        document contains no ``body/div`` the root is an empty ``teiDoc``.

    Raises:
        ValueError: If the document has texts but lacks a title, pubPlace or
            date element under ``bibl``, or has no element with a ``type``
            attribute.
    """
    tei_uri = "http://www.tei-c.org/ns/1.0"
    ns = "{" + tei_uri + "}"

    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", tei_uri)

    # Collect metadata and texts before the root is emptied
    bibl_child = ".//*" + ns + "bibl/" + ns
    title_src = root.find(bibl_child + "title")
    uri_src = root.find(bibl_child + "pubPlace")
    authors = root.findall(bibl_child + "author")
    date_src = root.find(bibl_child + "date")
    texts = root.findall(".//*" + ns + "body/" + ns + "div")
    domain_src = root.find(".//*[@type]")

    domain_text = None
    if texts:
        # The metadata is only needed when at least one entry is produced.
        # Fail with a message naming the gap instead of an AttributeError on
        # None somewhere inside the loop below.
        required = {
            "bibl/title": title_src,
            "bibl/pubPlace": uri_src,
            "bibl/date": date_src,
            "element with a 'type' attribute": domain_src,
        }
        missing = [label for label, element in required.items() if element is None]
        if missing:
            raise ValueError(f"{file}: missing required metadata: {', '.join(missing)}")
        domain_text = domain_src.get("type").replace("_hobbies", "_Hobbies")

    # Remove all elements from root (iterate over a copy while removing)
    for elem in list(root):
        root.remove(elem)

    # Rename root
    root.tag = "teiDoc"

    # Create target structure
    for i, div in enumerate(texts):
        tei = ET.SubElement(root, "TEI")
        tei_header = ET.SubElement(tei, "teiHeader")
        file_desc = ET.SubElement(tei_header, "fileDesc")

        title_stmt = ET.SubElement(file_desc, "titleStmt")
        text_sigle = ET.SubElement(title_stmt, "textSigle")
        text_sigle.text = "GA/" + f"{j:03}" + "." + f"{i:05}"
        title = ET.SubElement(title_stmt, "title")
        title.text = title_src.text
        domain = ET.SubElement(title_stmt, "domain")
        domain.text = domain_text

        print(text_sigle.text, end='\n')

        source_desc = ET.SubElement(file_desc, "sourceDesc")
        analytic = ET.SubElement(source_desc, "analytic")
        for author in authors:
            hauthor = ET.SubElement(analytic, "h.author")
            hauthor.text = author.text
            print(hauthor.text, end='\n')

        imprint = ET.SubElement(source_desc, "imprint")
        pub_date_year = ET.SubElement(imprint, "pubDate")
        pub_date_year.set("type", "year")
        pub_date_year.text = date_src.text
        # Month and day are emitted as empty placeholders.
        pub_date_month = ET.SubElement(imprint, "pubDate")
        pub_date_month.set("type", "month")
        pub_date_day = ET.SubElement(imprint, "pubDate")
        pub_date_day.set("type", "day")
        pub_place = ET.SubElement(imprint, "pubPlace")
        idno = ET.SubElement(pub_place, "idno")
        idno.set("type", "URI")
        idno.text = uri_src.text

        text = ET.SubElement(tei, "text")
        text.append(div)

    return tree
| 121 | |
| 122 | |
# Run the conversion only when executed as a script, so that importing this
# module (e.g. to reuse convert) does not start processing sys.argv.
if __name__ == "__main__":
    main()