nor2tei.py - ICC/nor2korap - Gitiles

 import os, sys
 import xml.etree.ElementTree as ET
 from xml.dom import minidom


 def main():
     """Build ``output/corpus.p5.xml`` from the files named on the command line.

     Every command-line argument is handed to ``convert`` together with its
     zero-based position, and the root of the returned tree is appended to a
     ``teiCorpus`` root element. A file whose conversion raises is skipped:
     its path is printed to stdout and the reason is reported on stderr.

     Side effects: creates the ``input`` and ``output`` directories when they
     are missing, writes the empty corpus skeleton to
     ``input/tree_structure.xml`` and the final corpus to
     ``output/corpus.p5.xml``.
     """
     # Create corpus structure from string and save into file
     skeleton = ET.fromstring("<teiCorpus>\n</teiCorpus>")
     skeleton_xml = minidom.parseString(ET.tostring(skeleton)).toprettyxml(indent="  ")
     os.makedirs("input", exist_ok=True)
     os.makedirs("output", exist_ok=True)
     with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
         f.write(skeleton_xml)

     # Parse corpus tree
     corpus_tree = ET.parse("input/tree_structure.xml")
     corpus_root = corpus_tree.getroot()

     # Process documents and append to corpus tree
     for index, path in enumerate(sys.argv[1:]):
         try:
             doc_root = convert(index, path).getroot()
         except Exception as exc:
             # Best effort: one broken input must not abort the whole corpus.
             # The bare path stays on stdout; the reason goes to stderr so it
             # does not pollute that list. KeyboardInterrupt/SystemExit are
             # deliberately not caught.
             print(path)
             print(f"  skipped: {type(exc).__name__}: {exc}", file=sys.stderr)
             continue
         corpus_root.append(doc_root)

     # Indent and save tree
     ET.indent(corpus_tree, "  ")
     corpus_tree.write(
         "output/corpus.p5.xml",
         encoding='utf-8',
         xml_declaration=True,
         method='xml',
         short_empty_elements=True,
     )


 def _domain_from_path(file):
     """Derive the domain label from the directories that contain *file*."""
     parts = file.split("/")
     parent = parts[-2]
     if parent in ('Persuasive', 'Blog'):
         return parent
     return parts[-3] + ':' + parent


 def convert(j, file):
     """Rewrite one TEI source file into a ``teiDoc`` element tree.

     The file is parsed, the first ``bibl`` title, author, date and pubPlace
     are read as metadata shared by all texts, and every ``body/div`` becomes
     one ``TEI`` child of the renamed root. Each ``TEI`` child carries a
     generated ``teiHeader`` plus the original ``div`` under ``text``.

     Args:
         j: Document number; zero-padded to three digits in each text sigle.
         file: "/"-separated path of the source file. The names of its parent
             directories supply the ``domain`` value.

     Returns:
         The parsed ``ElementTree``; its root has been rewritten in place.

     Raises:
         ValueError: If the file contains texts but lacks one of the ``bibl``
             metadata elements.
         IndexError: If the file contains texts and ``file`` has too few path
             components to derive the domain.
     """
     # Parse document tree and get root
     tree = ET.parse(file)
     root = tree.getroot()
     # Registering the TEI namespace under the empty prefix makes the
     # serializer write the copied TEI elements without a prefix.
     ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

     ns = "{http://www.tei-c.org/ns/1.0}"
     bibl_fields = {
         name: root.find(f".//*{ns}bibl/{ns}{name}")
         for name in ("title", "author", "date", "pubPlace")
     }
     # The list of divs is the single source of truth for how many texts
     # exist; counting separately can disagree with it and index past its end.
     texts = root.findall(f".//*{ns}body/{ns}div")

     # Remove all elements from root, then rename it
     for child in list(root):
         root.remove(child)
     root.tag = "teiDoc"

     if not texts:
         # Nothing to emit, so metadata and path are not needed either.
         return tree

     missing = [name for name, element in bibl_fields.items() if element is None]
     if missing:
         raise ValueError(f"{file}: no bibl/{', '.join(missing)} element found")

     # Loop-invariant values, computed once.
     title_text = bibl_fields["title"].text
     author_text = bibl_fields["author"].text
     date_text = bibl_fields["date"].text
     uri_text = bibl_fields["pubPlace"].text
     domain_text = _domain_from_path(file)

     # Create target structure
     for i, div in enumerate(texts):
         tei = ET.SubElement(root, "TEI")
         file_desc = ET.SubElement(ET.SubElement(tei, "teiHeader"), "fileDesc")
         title_stmt = ET.SubElement(file_desc, "titleStmt")
         source_desc = ET.SubElement(file_desc, "sourceDesc")

         ET.SubElement(title_stmt, "textSigle").text = f"NOR/{j:03}.{i:05}"
         ET.SubElement(title_stmt, "title").text = title_text
         ET.SubElement(title_stmt, "domain").text = domain_text

         analytic = ET.SubElement(source_desc, "analytic")
         ET.SubElement(analytic, "h.author").text = author_text

         imprint = ET.SubElement(source_desc, "imprint")
         ET.SubElement(imprint, "pubDate", {"type": "year"}).text = date_text
         # Month and day elements are emitted empty.
         ET.SubElement(imprint, "pubDate", {"type": "month"})
         ET.SubElement(imprint, "pubDate", {"type": "day"})
         pub_place = ET.SubElement(imprint, "pubPlace")
         ET.SubElement(pub_place, "idno", {"type": "URI"}).text = uri_text

         ET.SubElement(tei, "text").append(div)

     return tree


 # Script entry point: main() reads the input file paths from sys.argv.
 # NOTE: the call is unguarded, so importing this module also runs main().
 main()
	import os, sys
	import xml.etree.ElementTree as ET
	from xml.dom import minidom


	def main():
	    """Build ``output/corpus.p5.xml`` from the files named on the command line.

	    Every command-line argument is handed to ``convert`` together with its
	    zero-based position, and the root of the returned tree is appended to a
	    ``teiCorpus`` root element. A file whose conversion raises is skipped:
	    its path is printed to stdout and the reason is reported on stderr.

	    Side effects: creates the ``input`` and ``output`` directories when they
	    are missing, writes the empty corpus skeleton to
	    ``input/tree_structure.xml`` and the final corpus to
	    ``output/corpus.p5.xml``.
	    """
	    # Create corpus structure from string and save into file
	    skeleton = ET.fromstring("<teiCorpus>\n</teiCorpus>")
	    skeleton_xml = minidom.parseString(ET.tostring(skeleton)).toprettyxml(indent=" ")
	    os.makedirs("input", exist_ok=True)
	    os.makedirs("output", exist_ok=True)
	    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
	        f.write(skeleton_xml)

	    # Parse corpus tree
	    corpus_tree = ET.parse("input/tree_structure.xml")
	    corpus_root = corpus_tree.getroot()

	    # Process documents and append to corpus tree
	    for index, path in enumerate(sys.argv[1:]):
	        try:
	            doc_root = convert(index, path).getroot()
	        except Exception as exc:
	            # Best effort: one broken input must not abort the whole corpus.
	            # The bare path stays on stdout; the reason goes to stderr so it
	            # does not pollute that list. KeyboardInterrupt/SystemExit are
	            # deliberately not caught.
	            print(path)
	            print(f"  skipped: {type(exc).__name__}: {exc}", file=sys.stderr)
	            continue
	        corpus_root.append(doc_root)

	    # Indent and save tree
	    ET.indent(corpus_tree, " ")
	    corpus_tree.write(
	        "output/corpus.p5.xml",
	        encoding='utf-8',
	        xml_declaration=True,
	        method='xml',
	        short_empty_elements=True,
	    )


	def _domain_from_path(file):
	    """Derive the domain label from the directories that contain *file*."""
	    parts = file.split("/")
	    parent = parts[-2]
	    if parent in ('Persuasive', 'Blog'):
	        return parent
	    return parts[-3] + ':' + parent


	def convert(j, file):
	    """Rewrite one TEI source file into a ``teiDoc`` element tree.

	    The file is parsed, the first ``bibl`` title, author, date and pubPlace
	    are read as metadata shared by all texts, and every ``body/div`` becomes
	    one ``TEI`` child of the renamed root. Each ``TEI`` child carries a
	    generated ``teiHeader`` plus the original ``div`` under ``text``.

	    Args:
	        j: Document number; zero-padded to three digits in each text sigle.
	        file: "/"-separated path of the source file. The names of its parent
	            directories supply the ``domain`` value.

	    Returns:
	        The parsed ``ElementTree``; its root has been rewritten in place.

	    Raises:
	        ValueError: If the file contains texts but lacks one of the ``bibl``
	            metadata elements.
	        IndexError: If the file contains texts and ``file`` has too few path
	            components to derive the domain.
	    """
	    # Parse document tree and get root
	    tree = ET.parse(file)
	    root = tree.getroot()
	    # Registering the TEI namespace under the empty prefix makes the
	    # serializer write the copied TEI elements without a prefix.
	    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

	    ns = "{http://www.tei-c.org/ns/1.0}"
	    bibl_fields = {
	        name: root.find(f".//*{ns}bibl/{ns}{name}")
	        for name in ("title", "author", "date", "pubPlace")
	    }
	    # The list of divs is the single source of truth for how many texts
	    # exist; counting separately can disagree with it and index past its end.
	    texts = root.findall(f".//*{ns}body/{ns}div")

	    # Remove all elements from root, then rename it
	    for child in list(root):
	        root.remove(child)
	    root.tag = "teiDoc"

	    if not texts:
	        # Nothing to emit, so metadata and path are not needed either.
	        return tree

	    missing = [name for name, element in bibl_fields.items() if element is None]
	    if missing:
	        raise ValueError(f"{file}: no bibl/{', '.join(missing)} element found")

	    # Loop-invariant values, computed once.
	    title_text = bibl_fields["title"].text
	    author_text = bibl_fields["author"].text
	    date_text = bibl_fields["date"].text
	    uri_text = bibl_fields["pubPlace"].text
	    domain_text = _domain_from_path(file)

	    # Create target structure
	    for i, div in enumerate(texts):
	        tei = ET.SubElement(root, "TEI")
	        file_desc = ET.SubElement(ET.SubElement(tei, "teiHeader"), "fileDesc")
	        title_stmt = ET.SubElement(file_desc, "titleStmt")
	        source_desc = ET.SubElement(file_desc, "sourceDesc")

	        ET.SubElement(title_stmt, "textSigle").text = f"NOR/{j:03}.{i:05}"
	        ET.SubElement(title_stmt, "title").text = title_text
	        ET.SubElement(title_stmt, "domain").text = domain_text

	        analytic = ET.SubElement(source_desc, "analytic")
	        ET.SubElement(analytic, "h.author").text = author_text

	        imprint = ET.SubElement(source_desc, "imprint")
	        ET.SubElement(imprint, "pubDate", {"type": "year"}).text = date_text
	        # Month and day elements are emitted empty.
	        ET.SubElement(imprint, "pubDate", {"type": "month"})
	        ET.SubElement(imprint, "pubDate", {"type": "day"})
	        pub_place = ET.SubElement(imprint, "pubPlace")
	        ET.SubElement(pub_place, "idno", {"type": "URI"}).text = uri_text

	        ET.SubElement(tei, "text").append(div)

	    return tree


	# Script entry point: main() reads the input file paths from sys.argv.
	# NOTE: the call is unguarded, so importing this module also runs main().
	main()