# blob: 830e71592b9e9fc8fed19f7f24fd6ffad0113326 [file] [log] [blame]
import sys
import xml.etree.ElementTree as ET
def main():
    ''' Builds one teiCorpus document from the xml files named on the command line.

    Every argument after the script name is parsed with extract_data() and
    converted with create_tree(); the root of the resulting tree is appended
    to a common teiCorpus root. A file that cannot be processed is reported
    on stderr and skipped. The finished corpus is indented and written to
    stdout as text.
    '''
    corpusRoot = ET.Element("teiCorpus")
    # The number passed to create_tree() is the zero-based position of the
    # file on the command line, so a skipped file leaves a gap in the numbering.
    for filenumber, path in enumerate(sys.argv[1:]):
        try:
            doc_data = extract_data(path)
            doc_tree = create_tree(doc_data, filenumber)
        except Exception:
            # Exception instead of a bare except: KeyboardInterrupt and
            # SystemExit must still be able to stop the program.
            print("Warning: could not parse file: " + path, file=sys.stderr)
        else:
            corpusRoot.append(doc_tree.getroot())
    corpusTree = ET.ElementTree(corpusRoot)
    ET.indent(corpusTree, " ")
    # encoding='unicode' makes ElementTree write str rather than bytes, which
    # is what the text stream sys.stdout expects.
    corpusTree.write(sys.stdout, encoding='unicode')
def extract_data(file):
    ''' Parses an xml file and returns the metadata and paragraphs of its text elements.

    file is handed to ET.parse() unchanged, so it may be a file name or a
    file object.

    The returned dictionary maps the zero-based position of each
    {http://www.tei-c.org/ns/1.0}text element in the file to a dictionary of
    the following form:
    {'title': ..., 'url': ..., 'author': ..., 'date': ..., 'time': ..., 'text': [...]}

    title, url and author are the attributes of the same name (None when the
    attribute is absent). date and time are the first and second
    space-separated part of the date attribute; a part that is not present
    is returned as an empty string. text is the list of p elements found
    directly below body/div/div and is empty when that path does not exist.
    '''
    ns = "{http://www.tei-c.org/ns/1.0}"
    root = ET.parse(file).getroot()
    # Global ElementTree setting: elements in the TEI namespace are written
    # without a namespace prefix when a tree is serialized later on.
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
    data = {}
    for i, text in enumerate(root.iter(ns + "text")):
        # Split once; a missing attribute or a date without a time part
        # yields empty strings instead of raising.
        date_parts = (text.get('date') or '').split(' ')
        textelem = text.find(ns + "body/" + ns + "div/" + ns + "div")
        paragraphs = [] if textelem is None else textelem.findall(ns + "p")
        data[i] = {
            'title': text.get('title'),
            'url': text.get('url'),
            'author': text.get('author'),
            'date': date_parts[0],
            'time': date_parts[1] if len(date_parts) > 1 else '',
            'text': paragraphs,
        }
    return data
def create_tree(data, filenumber):
    ''' Receives a dictionary containing the data and returns an xml tree in the
    desired format. Generates text sigles of the following format: BNC/filenumber.textnumber,
    e.g. BNC/000.00000

    data maps an integer text number to a dictionary with the keys 'title',
    'url', 'author', 'date', 'time' and 'text' (a list of elements that are
    appended to the body). filenumber is an integer. Year, month and day are
    cut from fixed positions of 'date' (characters 0-3, 5-6 and 8-9).

    Returns an indented ElementTree whose root is a teiDoc element holding
    one TEI element per entry of data, in ascending order of text number.
    '''
    docRoot = ET.Element("teiDoc")
    for i in sorted(data):
        entry = data[i]
        tei = ET.SubElement(docRoot, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")

        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = f"BNC/{filenumber:03}.{i:05}"

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        ET.SubElement(analytic, "h.title").text = entry['title']
        ET.SubElement(analytic, "h.author").text = entry['author']

        imprint = ET.SubElement(sourceDesc, "imprint")
        date = entry['date']
        pub_dates = (
            ("year", date[0:4]),
            ("month", date[5:7]),
            ("day", date[8:10]),
            ("time", entry['time']),
        )
        for kind, value in pub_dates:
            pubDate = ET.SubElement(imprint, "pubDate")
            pubDate.set("type", kind)
            pubDate.text = value

        pubPlace = ET.SubElement(imprint, "pubPlace")
        ref = ET.SubElement(pubPlace, "ref")
        ref.set("type", "page_url")
        # An attribute value of None cannot be serialized by ElementTree and
        # would make writing the finished tree fail, so a missing url is
        # written as an empty target.
        url = entry['url']
        ref.set("target", "" if url is None else url)

        text = ET.SubElement(tei, "text")
        body = ET.SubElement(text, "body")
        body.extend(entry['text'])
    docTree = ET.ElementTree(docRoot)
    ET.indent(docTree, " ")
    return docTree
# Run the conversion only when the file is executed as a script, so that
# importing the module does not read sys.argv or write to stdout.
if __name__ == "__main__":
    main()