lora-sp | 2e2a21a | 2023-03-30 13:54:59 +0200 | [diff] [blame] | 1 | import os, sys |
| 2 | import xml.etree.ElementTree as ET |
| 3 | from xml.dom import minidom |
| 4 | |
| 5 | |
def main():
    """Build a TEI corpus file from the documents named on the command line.

    Writes an empty ``<teiCorpus>`` skeleton to ``input/tree_structure.xml``,
    parses it back, converts every path given in ``sys.argv[1:]`` with
    ``convert`` and appends the resulting root element to the corpus, then
    writes the indented result to ``output/corpus.p5.xml``.

    A document that fails to convert is skipped: its path is printed on
    stdout and the reason on stderr. The document index passed to ``convert``
    is the argument position, so skipped documents leave gaps in the numbering.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    origRoot = ET.fromstring(corpus)
    corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs("input", exist_ok=True)
    os.makedirs("output", exist_ok=True)
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(corpusStr)

    # Parse corpus tree
    corpusTree = ET.parse("input/tree_structure.xml")
    corpusRoot = corpusTree.getroot()

    # Process documents and append to corpus tree
    for index, path in enumerate(sys.argv[1:]):
        try:
            currentTree = convert(index, path)
        except Exception as err:
            # Best effort: report the failing document and carry on.
            # Catching Exception (not a bare except) lets Ctrl-C and
            # SystemExit still terminate the run.
            print(path)
            print(f"  skipped: {type(err).__name__}: {err}", file=sys.stderr)
            continue
        corpusRoot.append(currentTree.getroot())

    # Indent and save tree
    ET.indent(corpusTree, " ")
    corpusTree.write(
        "output/corpus.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
| 36 | |
def convert(j, file):
    """Convert one TEI document into a ``teiDoc`` tree, one ``TEI`` per body div.

    The document is parsed, its bibliographic metadata (title, author, date,
    pubPlace found under ``bibl``) and its ``body/div`` elements are collected,
    and the root is emptied and renamed to ``teiDoc``. For every collected div
    a ``TEI`` element with a ``teiHeader`` and a ``text`` wrapper holding that
    div is appended to the root.

    Args:
        j: Document index; rendered as three digits in each text sigle
            (``NOR/<j>.<i>`` where ``i`` is the zero-based div index).
        file: Path of the TEI file, split on ``"/"``. The parent directory
            name is the domain if it is ``Persuasive`` or ``Blog``;
            otherwise the domain is ``<grandparent>:<parent>``.

    Returns:
        The parsed ``ElementTree`` whose root has been rewritten in place.

    Raises:
        OSError: If ``file`` cannot be opened.
        xml.etree.ElementTree.ParseError: If ``file`` is not well-formed XML.
        AttributeError: If the document has body divs but lacks a ``bibl``
            title, author, date or pubPlace.
        IndexError: If the document has body divs but ``file`` has too few
            path components to derive a domain.
    """
    ns = "{http://www.tei-c.org/ns/1.0}"

    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    # Locate metadata and the text divisions before the root is emptied
    bibl = ".//*" + ns + "bibl/" + ns
    titles = root.find(bibl + "title")
    uris = root.find(bibl + "pubPlace")
    authors = root.find(bibl + "author")
    dates = root.find(bibl + "date")
    texts = root.findall(".//*" + ns + "body/" + ns + "div")

    # Remove all elements from root and rename it
    for elem in list(root):
        root.remove(elem)
    root.tag = "teiDoc"

    # A document without body divs yields an empty teiDoc; metadata and the
    # path are deliberately not inspected in that case.
    if not texts:
        return tree

    # Values shared by every generated TEI element
    title_text = titles.text
    author_text = authors.text
    date_text = dates.text
    uri_text = uris.text
    splitFName = file.split("/")
    if splitFName[-2] in ('Persuasive', 'Blog'):
        domain_text = splitFName[-2]
    else:
        domain_text = splitFName[-3] + ':' + splitFName[-2]

    # Create target structure: iterate the divs themselves so the number of
    # TEI elements can never disagree with the divs available.
    for i, div in enumerate(texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")

        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = "NOR/" + f"{j:03}" + "." + f"{i:05}"
        title = ET.SubElement(titleStmt, "title")
        title.text = title_text
        domain = ET.SubElement(titleStmt, "domain")
        domain.text = domain_text

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        hauthor = ET.SubElement(analytic, "h.author")
        hauthor.text = author_text

        imprint = ET.SubElement(sourceDesc, "imprint")
        pubDateYear = ET.SubElement(imprint, "pubDate")
        pubDateYear.set("type", "year")
        pubDateYear.text = date_text
        # Month and day placeholders are emitted empty.
        pubDateMonth = ET.SubElement(imprint, "pubDate")
        pubDateMonth.set("type", "month")
        pubDateDay = ET.SubElement(imprint, "pubDate")
        pubDateDay.set("type", "day")
        pubPlace = ET.SubElement(imprint, "pubPlace")
        idno = ET.SubElement(pubPlace, "idno")
        idno.set("type", "URI")
        idno.text = uri_text

        text = ET.SubElement(tei, "text")
        text.append(div)

    return tree
| 104 | |
| 105 | |
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()