Blame - nor2tei.py - ICC/nor2korap

blob: 6bbb53b1ded7c555fd5c07c1163bcc41f76284ac [file] [log] [blame]

lora-sp	2e2a21a	2023-03-30 13:54:59 +0200	[diff] [blame]	1	import os, sys
				2	import xml.etree.ElementTree as ET
				3	from xml.dom import minidom
				4
				5
				6	def main():
				7	# Create corpus structure from string and save into file
				8	corpus = "<teiCorpus>\n</teiCorpus>"
				9	origRoot = ET.fromstring(corpus)
				10	corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
				11	if not os.path.exists('input'):
				12	os.makedirs("input")
				13	if not os.path.exists('output'):
				14	os.makedirs("output")
				15	with open("input/tree_structure.xml", "w") as f:
				16	f.write(corpusStr)
				17
				18	# Parse corpus tree
				19	corpusTree = ET.parse("input/tree_structure.xml")
				20	corpusRoot = corpusTree.getroot()
				21
				22	# Process documents and append to corpus tree
				23	for j in range(1, len(sys.argv)):
				24	try:
				25	currentTree = convert(j-1, sys.argv[j])
				26	currentRoot = currentTree.getroot()
				27	corpusRoot.append(currentRoot)
				28	except:
				29	print(sys.argv[j])
				30	continue
				31
				32	# Indent and save tree
				33	ET.indent(corpusTree, " ")
				34	corpusTree.write("output/corpus.p5.xml", encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)
				35
				36
				37	def convert(j, file):
				38	# Parse document tree and get root
				39	tree = ET.parse(file)
				40	root = tree.getroot()
				41	ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
				42
				43	# Store metadata and texts in lists
				44	titles = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}title")
				45	uris = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}pubPlace")
				46	authors = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}author")
				47	dates = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}date")
				48	texts = root.findall(".//*{http://www.tei-c.org/ns/1.0}body/{http://www.tei-c.org/ns/1.0}div")
				49	domains = root.find(".//*[@type]")
				50
				51	# Count text elements
				52	number_of_texts = 0
				53	for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
				54	for body in text:
				55	for div in body:
				56	number_of_texts+=1
				57
				58	# Remove all elements from root
				59	for elem in root.findall("*"):
				60	root.remove(elem)
				61
				62	# Rename root
				63	root.tag = "teiDoc"
				64
				65	# Create target structure
				66	for i in range(number_of_texts):
				67	tei = ET.SubElement(root, "TEI")
				68	teiHeader = ET.SubElement(tei, "teiHeader")
				69	fileDesc = ET.SubElement(teiHeader, "fileDesc")
				70	titleStmt = ET.SubElement(fileDesc, "titleStmt")
				71	textSigle = ET.SubElement(titleStmt, "textSigle")
				72	textSigle.text = "NO/" + f"{j:03}" + "." + f"{i:05}"
				73	sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
				74	analytic = ET.SubElement(sourceDesc, "analytic")
				75	title = ET.SubElement(titleStmt, "title")
				76	title.text = titles.text
				77	hauthor = ET.SubElement(analytic, "h.author")
				78	hauthor.text = authors.text
				79	imprint = ET.SubElement(sourceDesc, "imprint")
				80	pubDateYear = ET.SubElement(imprint, "pubDate")
				81	pubDateYear.set("type", "year")
				82	pubDateYear.text = dates.text
				83	pubDateMonth = ET.SubElement(imprint, "pubDate")
				84	pubDateMonth.set("type", "month")
				85	#pubDateMonth.text = datesPublished[i].text
				86	pubDateDay = ET.SubElement(imprint, "pubDate")
				87	pubDateDay.set("type", "day")
				88	#pubDateDay.text = datesPublished[i].text
				89	pubPlace = ET.SubElement(imprint, "pubPlace")
				90	idno = ET.SubElement(pubPlace, "idno")
				91	idno.set("type", "URI")
				92	idno.text = uris.text
				93	domain = ET.SubElement(titleStmt, "domain")
				94	domain.text = domains.get("type")
Marc Kupietz	1d9fd6b	2023-05-23 17:54:20 +0200	[diff] [blame^]	95	splitFName = file.split("/")
				96	if (splitFName[-2] in ['Persuasive', 'Blog']):
				97	domain.text = splitFName[-2]
				98	else:
				99	domain.text = splitFName[-3] + ':' + splitFName[-2]
lora-sp	2e2a21a	2023-03-30 13:54:59 +0200	[diff] [blame]	100	text = ET.SubElement(tei, "text")
				101	text.append(texts[i])
				102
				103	return tree
				104
				105
				106	main()