Blame - nor2tei.py - ICC/nor2korap

blob: 5c67bdc991c2ad483d85b123936ef3c50c227eba [file] [log] [blame]

lora-sp	2e2a21a	2023-03-30 13:54:59 +0200	[diff] [blame]	1	import os, sys
				2	import xml.etree.ElementTree as ET
				3	from xml.dom import minidom
Marc Kupietz	17269e4	2023-05-25 10:59:01 +0200	[diff] [blame^]	4	import traceback
				5	import sys
lora-sp	2e2a21a	2023-03-30 13:54:59 +0200	[diff] [blame]	6
				7	def main():
				8	# Create corpus structure from string and save into file
				9	corpus = "<teiCorpus>\n</teiCorpus>"
				10	origRoot = ET.fromstring(corpus)
				11	corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
				12	if not os.path.exists('input'):
				13	os.makedirs("input")
				14	if not os.path.exists('output'):
				15	os.makedirs("output")
				16	with open("input/tree_structure.xml", "w") as f:
				17	f.write(corpusStr)
				18
				19	# Parse corpus tree
				20	corpusTree = ET.parse("input/tree_structure.xml")
				21	corpusRoot = corpusTree.getroot()
				22
				23	# Process documents and append to corpus tree
				24	for j in range(1, len(sys.argv)):
Marc Kupietz	17269e4	2023-05-25 10:59:01 +0200	[diff] [blame^]	25	try:
				26	currentTree = convert(j-1, sys.argv[j])
				27	currentRoot = currentTree.getroot()
				28	corpusRoot.append(currentRoot)
				29	except:
				30	print("ERROR:"+sys.argv[j])
				31	print(traceback.format_exc())
				32	print(sys.exc_info()[2])
				33	continue
lora-sp	2e2a21a	2023-03-30 13:54:59 +0200	[diff] [blame]	34
				35	# Indent and save tree
				36	ET.indent(corpusTree, " ")
				37	corpusTree.write("output/corpus.p5.xml", encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)
				38
				39
				40	def convert(j, file):
				41	# Parse document tree and get root
				42	tree = ET.parse(file)
				43	root = tree.getroot()
				44	ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
				45
				46	# Store metadata and texts in lists
				47	titles = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}title")
				48	uris = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}pubPlace")
				49	authors = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}author")
				50	dates = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}date")
				51	texts = root.findall(".//*{http://www.tei-c.org/ns/1.0}body/{http://www.tei-c.org/ns/1.0}div")
				52	domains = root.find(".//*[@type]")
				53
				54	# Count text elements
				55	number_of_texts = 0
				56	for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
				57	for body in text:
				58	for div in body:
				59	number_of_texts+=1
				60
				61	# Remove all elements from root
				62	for elem in root.findall("*"):
				63	root.remove(elem)
				64
				65	# Rename root
				66	root.tag = "teiDoc"
				67
				68	# Create target structure
				69	for i in range(number_of_texts):
				70	tei = ET.SubElement(root, "TEI")
				71	teiHeader = ET.SubElement(tei, "teiHeader")
				72	fileDesc = ET.SubElement(teiHeader, "fileDesc")
				73	titleStmt = ET.SubElement(fileDesc, "titleStmt")
				74	textSigle = ET.SubElement(titleStmt, "textSigle")
Marc Kupietz	386c8af	2023-05-23 17:56:42 +0200	[diff] [blame]	75	textSigle.text = "NOR/" + f"{j:03}" + "." + f"{i:05}"
lora-sp	2e2a21a	2023-03-30 13:54:59 +0200	[diff] [blame]	76	sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
				77	analytic = ET.SubElement(sourceDesc, "analytic")
				78	title = ET.SubElement(titleStmt, "title")
				79	title.text = titles.text
				80	hauthor = ET.SubElement(analytic, "h.author")
				81	hauthor.text = authors.text
				82	imprint = ET.SubElement(sourceDesc, "imprint")
				83	pubDateYear = ET.SubElement(imprint, "pubDate")
				84	pubDateYear.set("type", "year")
				85	pubDateYear.text = dates.text
				86	pubDateMonth = ET.SubElement(imprint, "pubDate")
				87	pubDateMonth.set("type", "month")
				88	#pubDateMonth.text = datesPublished[i].text
				89	pubDateDay = ET.SubElement(imprint, "pubDate")
				90	pubDateDay.set("type", "day")
				91	#pubDateDay.text = datesPublished[i].text
				92	pubPlace = ET.SubElement(imprint, "pubPlace")
				93	idno = ET.SubElement(pubPlace, "idno")
				94	idno.set("type", "URI")
				95	idno.text = uris.text
				96	domain = ET.SubElement(titleStmt, "domain")
				97	domain.text = domains.get("type")
Marc Kupietz	1d9fd6b	2023-05-23 17:54:20 +0200	[diff] [blame]	98	splitFName = file.split("/")
				99	if (splitFName[-2] in ['Persuasive', 'Blog']):
				100	domain.text = splitFName[-2]
Marc Kupietz	7f9a493	2023-05-25 10:58:15 +0200	[diff] [blame]	101	elif(splitFName[-4] != "Originaldaten"):
				102	domain.text = splitFName[-4] + ':' + splitFName[-3] + ':' + splitFName[-2]
Marc Kupietz	1d9fd6b	2023-05-23 17:54:20 +0200	[diff] [blame]	103	else:
				104	domain.text = splitFName[-3] + ':' + splitFName[-2]
Marc Kupietz	7f9a493	2023-05-25 10:58:15 +0200	[diff] [blame]	105	domain.text = domain.text.replace("_hobbies", "_Hobbies")
lora-sp	2e2a21a	2023-03-30 13:54:59 +0200	[diff] [blame]	106	text = ET.SubElement(tei, "text")
				107	text.append(texts[i])
				108
				109	return tree
				110
				111
				112	main()