# bunc2tei.py - lora-sp/bunc2tei - Gitiles

 import sys
 import xml.etree.ElementTree as ET


 def main():
     """Convert the XML files named on the command line into one TEI corpus.

     Every command-line argument is parsed with ``extract_data`` and turned
     into a ``teiDoc`` element by ``create_tree``; the zero-based position of
     the argument is passed on as the file number. All documents are appended
     to a single ``teiCorpus`` root, which is indented and written to standard
     output. A file that cannot be processed is reported on standard error and
     skipped.
     """
     corpusRoot = ET.Element("teiCorpus")

     for filenumber, path in enumerate(sys.argv[1:]):
         try:
             doc_tree = create_tree(extract_data(path), filenumber)
         except Exception:
             # Deliberately broad: a failure in one input must not abort the
             # whole run. Unlike a bare ``except``, this still lets
             # KeyboardInterrupt and SystemExit propagate.
             print("Warning: could not parse file: " + path, file=sys.stderr)
             continue
         corpusRoot.append(doc_tree.getroot())

     corpusTree = ET.ElementTree(corpusRoot)
     ET.indent(corpusTree, "  ")
     corpusTree.write(sys.stdout, encoding='unicode')


 def extract_data(file):
     """Parse a TEI XML file and collect the metadata and paragraphs of its texts.

     Args:
         file: File name or file object accepted by ``ET.parse``.

     Returns:
         A dictionary keyed by the zero-based position of each TEI ``text``
         element in document order. Every value is a dictionary with the keys
         ``title``, ``url`` and ``author`` (attribute values, ``None`` when the
         attribute is absent), ``date`` and ``time`` (the first two
         space-separated parts of the ``date`` attribute, ``''`` when missing)
         and ``text`` (list of the ``p`` elements found directly under
         ``body/div/div``; empty when that container does not exist).

     Raises:
         OSError: If the file cannot be opened.
         xml.etree.ElementTree.ParseError: If the content is not well-formed XML.
     """
     ns = "{http://www.tei-c.org/ns/1.0}"

     root = ET.parse(file).getroot()
     # Register the TEI namespace under the empty prefix; the registry is
     # global and is consulted when elements of that namespace are serialised.
     ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

     data = {}

     for i, text in enumerate(root.iter(ns + "text")):
         # The "date" attribute carries "<date> <time>". Tolerate it being
         # absent or lacking the time part instead of failing the whole file.
         stamp = (text.get('date') or '').split(' ')
         container = text.find(f"./{ns}body/{ns}div/{ns}div")

         data[i] = {
             'title': text.get('title'),
             'url': text.get('url'),
             'author': text.get('author'),
             'date': stamp[0],
             'time': stamp[1] if len(stamp) > 1 else '',
             # A text without the expected body/div/div nesting simply has no
             # paragraphs. Compare against None: an empty Element is falsy.
             'text': [] if container is None else container.findall(f"./{ns}p"),
         }

     return data


 def create_tree(data, filenumber):
     """Build the ``teiDoc`` tree for the texts extracted from one file.

     Args:
         data: Mapping from integer text number to a dictionary with the keys
             ``title``, ``url``, ``author``, ``date``, ``time`` and ``text``
             (an iterable of elements that are placed in the body).
         filenumber: Integer used as the file part of the text sigle.

     Returns:
         An indented ``ET.ElementTree`` whose root ``teiDoc`` holds one ``TEI``
         element per entry of ``data``, in ascending text-number order. Each
         text sigle has the form ``BNC/filenumber.textnumber``, zero-padded to
         three and five digits, e.g. ``BNC/000.00000``.
     """
     docRoot = ET.Element("teiDoc")

     for i in sorted(data):
         entry = data[i]

         tei = ET.SubElement(docRoot, "TEI")
         fileDesc = ET.SubElement(ET.SubElement(tei, "teiHeader"), "fileDesc")

         titleStmt = ET.SubElement(fileDesc, "titleStmt")
         ET.SubElement(titleStmt, "textSigle").text = f"BNC/{filenumber:03}.{i:05}"

         sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
         analytic = ET.SubElement(sourceDesc, "analytic")
         ET.SubElement(analytic, "h.title").text = entry['title']
         ET.SubElement(analytic, "h.author").text = entry['author']

         imprint = ET.SubElement(sourceDesc, "imprint")
         date = entry['date']
         # The date is cut by character position (0-4, 5-7, 8-10).
         pub_dates = (
             ("year", date[0:4]),
             ("month", date[5:7]),
             ("day", date[8:10]),
             ("time", entry['time']),
         )
         for kind, value in pub_dates:
             ET.SubElement(imprint, "pubDate", {"type": kind}).text = value

         pubPlace = ET.SubElement(imprint, "pubPlace")
         # Attribute values must be strings: a missing url (None) would make
         # serialising the tree fail later, so fall back to an empty target.
         ref = ET.SubElement(pubPlace, "ref", {"type": "page_url"})
         ref.set("target", entry['url'] or "")

         body = ET.SubElement(ET.SubElement(tei, "text"), "body")
         body.extend(entry['text'])

     docTree = ET.ElementTree(docRoot)
     ET.indent(docTree, "  ")

     return docTree


 # Run the conversion only when executed as a script, not when imported.
 if __name__ == "__main__":
     main()
	import sys
	import xml.etree.ElementTree as ET


	def main():
	    """Convert the XML files named on the command line into one TEI corpus.

	    Every command-line argument is parsed with ``extract_data`` and turned
	    into a ``teiDoc`` element by ``create_tree``; the zero-based position of
	    the argument is passed on as the file number. All documents are appended
	    to a single ``teiCorpus`` root, which is indented and written to standard
	    output. A file that cannot be processed is reported on standard error and
	    skipped.
	    """
	    corpusRoot = ET.Element("teiCorpus")

	    for filenumber, path in enumerate(sys.argv[1:]):
	        try:
	            doc_tree = create_tree(extract_data(path), filenumber)
	        except Exception:
	            # Deliberately broad: a failure in one input must not abort the
	            # whole run. Unlike a bare ``except``, this still lets
	            # KeyboardInterrupt and SystemExit propagate.
	            print("Warning: could not parse file: " + path, file=sys.stderr)
	            continue
	        corpusRoot.append(doc_tree.getroot())

	    corpusTree = ET.ElementTree(corpusRoot)
	    ET.indent(corpusTree, " ")
	    corpusTree.write(sys.stdout, encoding='unicode')


	def extract_data(file):
	    """Parse a TEI XML file and collect the metadata and paragraphs of its texts.

	    Args:
	        file: File name or file object accepted by ``ET.parse``.

	    Returns:
	        A dictionary keyed by the zero-based position of each TEI ``text``
	        element in document order. Every value is a dictionary with the keys
	        ``title``, ``url`` and ``author`` (attribute values, ``None`` when the
	        attribute is absent), ``date`` and ``time`` (the first two
	        space-separated parts of the ``date`` attribute, ``''`` when missing)
	        and ``text`` (list of the ``p`` elements found directly under
	        ``body/div/div``; empty when that container does not exist).

	    Raises:
	        OSError: If the file cannot be opened.
	        xml.etree.ElementTree.ParseError: If the content is not well-formed XML.
	    """
	    ns = "{http://www.tei-c.org/ns/1.0}"

	    root = ET.parse(file).getroot()
	    # Register the TEI namespace under the empty prefix; the registry is
	    # global and is consulted when elements of that namespace are serialised.
	    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

	    data = {}

	    for i, text in enumerate(root.iter(ns + "text")):
	        # The "date" attribute carries "<date> <time>". Tolerate it being
	        # absent or lacking the time part instead of failing the whole file.
	        stamp = (text.get('date') or '').split(' ')
	        container = text.find(f"./{ns}body/{ns}div/{ns}div")

	        data[i] = {
	            'title': text.get('title'),
	            'url': text.get('url'),
	            'author': text.get('author'),
	            'date': stamp[0],
	            'time': stamp[1] if len(stamp) > 1 else '',
	            # A text without the expected body/div/div nesting simply has no
	            # paragraphs. Compare against None: an empty Element is falsy.
	            'text': [] if container is None else container.findall(f"./{ns}p"),
	        }

	    return data


	def create_tree(data, filenumber):
	    """Build the ``teiDoc`` tree for the texts extracted from one file.

	    Args:
	        data: Mapping from integer text number to a dictionary with the keys
	            ``title``, ``url``, ``author``, ``date``, ``time`` and ``text``
	            (an iterable of elements that are placed in the body).
	        filenumber: Integer used as the file part of the text sigle.

	    Returns:
	        An indented ``ET.ElementTree`` whose root ``teiDoc`` holds one ``TEI``
	        element per entry of ``data``, in ascending text-number order. Each
	        text sigle has the form ``BNC/filenumber.textnumber``, zero-padded to
	        three and five digits, e.g. ``BNC/000.00000``.
	    """
	    docRoot = ET.Element("teiDoc")

	    for i in sorted(data):
	        entry = data[i]

	        tei = ET.SubElement(docRoot, "TEI")
	        fileDesc = ET.SubElement(ET.SubElement(tei, "teiHeader"), "fileDesc")

	        titleStmt = ET.SubElement(fileDesc, "titleStmt")
	        ET.SubElement(titleStmt, "textSigle").text = f"BNC/{filenumber:03}.{i:05}"

	        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
	        analytic = ET.SubElement(sourceDesc, "analytic")
	        ET.SubElement(analytic, "h.title").text = entry['title']
	        ET.SubElement(analytic, "h.author").text = entry['author']

	        imprint = ET.SubElement(sourceDesc, "imprint")
	        date = entry['date']
	        # The date is cut by character position (0-4, 5-7, 8-10).
	        pub_dates = (
	            ("year", date[0:4]),
	            ("month", date[5:7]),
	            ("day", date[8:10]),
	            ("time", entry['time']),
	        )
	        for kind, value in pub_dates:
	            ET.SubElement(imprint, "pubDate", {"type": kind}).text = value

	        pubPlace = ET.SubElement(imprint, "pubPlace")
	        # Attribute values must be strings: a missing url (None) would make
	        # serialising the tree fail later, so fall back to an empty target.
	        ref = ET.SubElement(pubPlace, "ref", {"type": "page_url"})
	        ref.set("target", entry['url'] or "")

	        body = ET.SubElement(ET.SubElement(tei, "text"), "body")
	        body.extend(entry['text'])

	    docTree = ET.ElementTree(docRoot)
	    ET.indent(docTree, " ")

	    return docTree


	# Run the conversion only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()