| import os, sys |
| import xml.etree.ElementTree as ET |
| from xml.dom import minidom |
| import traceback |
| import sys |
| |
def main():
    """Build a TEI corpus file from the documents named on the command line.

    An empty ``teiCorpus`` skeleton is written to ``input/tree_structure.xml``
    and parsed back as the corpus tree. Every command-line argument is then
    passed to ``convert`` together with its zero-based position, and the root
    of the returned tree is appended to the corpus root. A document that fails
    to convert is reported on stdout and skipped. The indented result is
    written to ``output/corpus.p5.xml``.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    origRoot = ET.fromstring(corpus)
    corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("input", exist_ok=True)
    os.makedirs("output", exist_ok=True)
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(corpusStr)

    # Parse corpus tree
    corpusTree = ET.parse("input/tree_structure.xml")
    corpusRoot = corpusTree.getroot()

    # Process documents and append to corpus tree
    for index, path in enumerate(sys.argv[1:]):
        try:
            currentTree = convert(index, path)
            corpusRoot.append(currentTree.getroot())
        except Exception:
            # Best effort: report the failing document and carry on with the
            # rest. Only Exception is caught so that Ctrl-C (KeyboardInterrupt)
            # and SystemExit still stop the run.
            print("ERROR:" + path)
            print(traceback.format_exc())
            print(sys.exc_info()[2])
            continue

    # Indent and save tree
    ET.indent(corpusTree, " ")
    corpusTree.write(
        "output/corpus.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
| |
| |
_TEI_NS = "http://www.tei-c.org/ns/1.0"


def _domain_from_path(file):
    """Derive the domain label from the directory components of *file*."""
    parts = file.split("/")
    if parts[-2] in ("Persuasive", "Blog"):
        label = parts[-2]
    elif parts[-4] != "Originaldaten":
        label = ":".join(parts[-4:-1])
    else:
        label = ":".join(parts[-3:-1])
    return label.replace("_hobbies", "_Hobbies")


def convert(j, file):
    """Rewrite one TEI document as a ``teiDoc`` holding one ``TEI`` per div.

    The document is parsed, its bibliographic metadata (title, author, date,
    pubPlace under ``bibl``) and every ``div`` directly below a ``body`` are
    collected, and all children of the root are removed. The root is renamed
    to ``teiDoc`` and, for each collected div, a ``TEI`` element with a
    generated header is appended that wraps the div in a ``text`` element.

    Args:
        j: Zero-based document index; used as the three-digit part of the
            ``textSigle`` (``NOR/<j>.<div index>``).
        file: Path of the TEI file, split on ``/``. Its directory components
            provide the ``domain`` value.

    Returns:
        The parsed ``ElementTree`` whose root has been rewritten in place.

    Raises:
        AttributeError: If the document has divs but one of the ``bibl``
            metadata elements is missing.
        IndexError: If the document has divs but *file* has too few path
            components to derive a domain.

    Note:
        Calls ``ET.register_namespace`` for the TEI namespace, which affects
        serialization module-wide.
    """
    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", _TEI_NS)

    # Store metadata and texts
    ns = "{" + _TEI_NS + "}"
    bibl_child = ".//*" + ns + "bibl/" + ns
    titles = root.find(bibl_child + "title")
    uris = root.find(bibl_child + "pubPlace")
    authors = root.find(bibl_child + "author")
    dates = root.find(bibl_child + "date")
    texts = root.findall(".//*" + ns + "body/" + ns + "div")

    # Remove all elements from root
    for elem in root.findall("*"):
        root.remove(elem)

    # Rename root
    root.tag = "teiDoc"

    if not texts:
        return tree

    # The domain depends only on the path, so compute it once per document.
    domain_text = _domain_from_path(file)

    # Create target structure. Iterating the collected divs directly keeps
    # the number of TEI entries equal to the number of divs available.
    for i, div in enumerate(texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")

        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = f"NOR/{j:03}.{i:05}"
        title = ET.SubElement(titleStmt, "title")
        title.text = titles.text
        domain = ET.SubElement(titleStmt, "domain")
        domain.text = domain_text

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        hauthor = ET.SubElement(analytic, "h.author")
        hauthor.text = authors.text

        imprint = ET.SubElement(sourceDesc, "imprint")
        pubDateYear = ET.SubElement(imprint, "pubDate")
        pubDateYear.set("type", "year")
        pubDateYear.text = dates.text
        # Month and day elements are emitted without text content.
        pubDateMonth = ET.SubElement(imprint, "pubDate")
        pubDateMonth.set("type", "month")
        pubDateDay = ET.SubElement(imprint, "pubDate")
        pubDateDay.set("type", "day")
        pubPlace = ET.SubElement(imprint, "pubPlace")
        idno = ET.SubElement(pubPlace, "idno")
        idno.set("type", "URI")
        idno.text = uris.text

        text = ET.SubElement(tei, "text")
        text.append(div)

    return tree
| |
| |
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()