blob: 2be37ea786440eb6ef92e84a19ab9e0ea834f31d [file] [log] [blame]
import os, sys
import xml.etree.ElementTree as ET
from xml.dom import minidom
import traceback
import sys
def main():
    """Build the combined corpus file from the documents named on the command line.

    Every argument after the program name is passed to ``convert`` together
    with its zero-based position; the root of each converted tree is appended
    to a ``teiCorpus`` root. The result is indented and written to
    ``output/GA_corpus.p5.xml``. The empty corpus skeleton is also written to
    ``input/tree_structure.xml`` and parsed back from there.

    A document whose conversion raises is reported on stdout (path and
    traceback) and skipped; the remaining documents are still processed.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    skeleton_root = ET.fromstring(corpus)
    skeleton_xml = minidom.parseString(ET.tostring(skeleton_root)).toprettyxml(indent=" ")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    for directory in ("input", "output"):
        os.makedirs(directory, exist_ok=True)
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(skeleton_xml)

    # Parse corpus tree
    corpus_tree = ET.parse("input/tree_structure.xml")
    corpus_root = corpus_tree.getroot()

    # Process documents and append to corpus tree
    for index, path in enumerate(sys.argv[1:]):
        try:
            document_root = convert(index, path).getroot()
        except Exception:
            # Best effort: report the failing document and carry on. Catching
            # Exception (not a bare except) lets Ctrl-C and SystemExit through.
            print("ERROR:" + path)
            print(traceback.format_exc())
            continue
        corpus_root.append(document_root)

    # Indent and save tree
    ET.indent(corpus_tree, " ")
    corpus_tree.write(
        "output/GA_corpus.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
def convert(j, file):
    """Restructure one TEI document into a ``teiDoc`` with one ``TEI`` per text.

    The document is parsed, the first ``bibl`` title, pubPlace, author and
    date elements and all ``body/div`` elements are looked up, and the root is
    then emptied and renamed to ``teiDoc``. For every ``div`` a ``TEI``
    element is appended that carries a ``teiHeader`` (text sigle, title,
    domain, author, publication year, URI) and a ``text`` element wrapping
    that ``div``. The month and day ``pubDate`` elements are created but left
    without text.

    Args:
        j: Document number; formatted as three digits in the text sigle
            ``GA/<j>.<text index>``.
        file: Path of the TEI document. Its directory names determine the
            domain label (see ``_domain_from_path``).

    Returns:
        The parsed ``ElementTree`` whose root has been rewritten in place.

    Raises:
        AttributeError: If the document contains at least one text but one of
            the ``bibl`` title/author/date/pubPlace elements is missing.
    """
    ns = "{http://www.tei-c.org/ns/1.0}"

    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    # Look up metadata and texts before the root is emptied below.
    title_src = root.find(f".//*{ns}bibl/{ns}title")
    uri_src = root.find(f".//*{ns}bibl/{ns}pubPlace")
    author_src = root.find(f".//*{ns}bibl/{ns}author")
    date_src = root.find(f".//*{ns}bibl/{ns}date")
    divs = root.findall(f".//*{ns}body/{ns}div")

    # The domain depends only on the path, so compute it once for all texts.
    domain_label = _domain_from_path(file)

    # Remove all elements from root and rename it
    for child in list(root):
        root.remove(child)
    root.tag = "teiDoc"

    # Create target structure
    for i, div in enumerate(divs):
        tei = ET.SubElement(root, "TEI")
        tei_header = ET.SubElement(tei, "teiHeader")
        file_desc = ET.SubElement(tei_header, "fileDesc")

        title_stmt = ET.SubElement(file_desc, "titleStmt")
        text_sigle = ET.SubElement(title_stmt, "textSigle")
        text_sigle.text = "GA/" + f"{j:03}" + "." + f"{i:05}"
        title = ET.SubElement(title_stmt, "title")
        title.text = title_src.text
        domain = ET.SubElement(title_stmt, "domain")
        domain.text = domain_label

        source_desc = ET.SubElement(file_desc, "sourceDesc")
        analytic = ET.SubElement(source_desc, "analytic")
        h_author = ET.SubElement(analytic, "h.author")
        h_author.text = author_src.text

        imprint = ET.SubElement(source_desc, "imprint")
        pub_year = ET.SubElement(imprint, "pubDate")
        pub_year.set("type", "year")
        pub_year.text = date_src.text
        # Month and day are emitted as empty placeholders.
        ET.SubElement(imprint, "pubDate").set("type", "month")
        ET.SubElement(imprint, "pubDate").set("type", "day")
        pub_place = ET.SubElement(imprint, "pubPlace")
        idno = ET.SubElement(pub_place, "idno")
        idno.set("type", "URI")
        idno.text = uri_src.text

        text = ET.SubElement(tei, "text")
        text.append(div)
    return tree


def _domain_from_path(file):
    """Derive the domain label from the directory names in *file*.

    If the parent directory is ``Persuasive`` or ``Blog`` it is the label.
    Otherwise the last three directories are joined with ``:``, dropping the
    first of them when it is ``Originaldaten``. Paths with fewer directories
    use whatever directories exist instead of raising ``IndexError``.
    ``_hobbies`` is normalised to ``_Hobbies`` in the result.
    """
    parts = os.fspath(file).replace(os.sep, "/").split("/")
    directories = parts[:-1]
    if directories and directories[-1] in ("Persuasive", "Blog"):
        label = directories[-1]
    else:
        trailing = directories[-3:]
        if len(trailing) == 3 and trailing[0] == "Originaldaten":
            trailing = trailing[1:]
        label = ":".join(trailing)
    return label.replace("_hobbies", "_Hobbies")
# Run the conversion; this call is at module level, so it also executes on import.
main()