Blame - irish2tei.py - ICC/irish2korap

blob: 84c908c40828a4a302cc366c0842f03ba357ec8c [file] [log] [blame]

Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	1	import os, sys
				2	import xml.etree.ElementTree as ET
				3	from xml.dom import minidom
				4	import traceback
				5	import sys
				6
				7	def main():
				8	# Create corpus structure from string and save into file
				9	corpus = "<teiCorpus>\n</teiCorpus>"
				10	origRoot = ET.fromstring(corpus)
				11	corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
				12	if not os.path.exists('input'):
				13	os.makedirs("input")
				14	if not os.path.exists('output'):
				15	os.makedirs("output")
				16	with open("input/tree_structure.xml", "w") as f:
				17	f.write(corpusStr)
				18
				19	# Parse corpus tree
				20	corpusTree = ET.parse("input/tree_structure.xml")
				21	corpusRoot = corpusTree.getroot()
				22
				23	# Process documents and append to corpus tree
				24	for j in range(1, len(sys.argv)):
				25	try:
				26	currentTree = convert(j-1, sys.argv[j])
				27	currentRoot = currentTree.getroot()
				28	corpusRoot.append(currentRoot)
				29	except:
				30	print("ERROR:"+sys.argv[j])
				31	print(traceback.format_exc())
				32	print(sys.exc_info()[2])
				33	continue
				34
				35	# Indent and save tree
				36	ET.indent(corpusTree, " ")
Rameela Yaddehige	6c0ff87	2023-07-14 14:46:16 +0200	[diff] [blame]	37	corpusTree.write("output/GA_corpus.p5.xml", encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)
Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	38
				39
				40	def convert(j, file):
				41	# Parse document tree and get root
				42	tree = ET.parse(file)
				43	root = tree.getroot()
				44	ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
				45
				46	# Store metadata and texts in lists
				47	titles = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}title")
				48	uris = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}pubPlace")
Rameela Yaddehige	46ebd4a	2023-08-09 12:33:21 +0200	[diff] [blame^]	49	authors = root.findall(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}author")
				50
Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	51	dates = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}date")
				52	texts = root.findall(".//*{http://www.tei-c.org/ns/1.0}body/{http://www.tei-c.org/ns/1.0}div")
				53	domains = root.find(".//*[@type]")
				54
				55	# Count text elements
				56	number_of_texts = 0
				57	for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
				58	for body in text:
				59	for div in body:
				60	number_of_texts+=1
				61
				62	number_of_texts = len(texts)
				63	# Remove all elements from root
				64	for elem in root.findall("*"):
				65	root.remove(elem)
				66
				67	# Rename root
				68	root.tag = "teiDoc"
				69
				70	# Create target structure
				71	for i in range(number_of_texts):
				72	tei = ET.SubElement(root, "TEI")
				73	teiHeader = ET.SubElement(tei, "teiHeader")
				74	fileDesc = ET.SubElement(teiHeader, "fileDesc")
				75	titleStmt = ET.SubElement(fileDesc, "titleStmt")
				76	textSigle = ET.SubElement(titleStmt, "textSigle")
				77	textSigle.text = "GA/" + f"{j:03}" + "." + f"{i:05}"
				78	sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
				79	analytic = ET.SubElement(sourceDesc, "analytic")
				80	title = ET.SubElement(titleStmt, "title")
				81	title.text = titles.text
Rameela Yaddehige	46ebd4a	2023-08-09 12:33:21 +0200	[diff] [blame^]	82
				83	print(textSigle.text, end='\n')
				84
				85
				86	for author in authors :
				87	hauthor = ET.SubElement(analytic, "h.author")
				88	hauthor.text = author.text
				89	print(hauthor.text, end='\n')
Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	90	imprint = ET.SubElement(sourceDesc, "imprint")
				91	pubDateYear = ET.SubElement(imprint, "pubDate")
				92	pubDateYear.set("type", "year")
				93	pubDateYear.text = dates.text
				94	pubDateMonth = ET.SubElement(imprint, "pubDate")
				95	pubDateMonth.set("type", "month")
				96	#pubDateMonth.text = datesPublished[i].text
				97	pubDateDay = ET.SubElement(imprint, "pubDate")
				98	pubDateDay.set("type", "day")
				99	#pubDateDay.text = datesPublished[i].text
				100	pubPlace = ET.SubElement(imprint, "pubPlace")
				101	idno = ET.SubElement(pubPlace, "idno")
				102	idno.set("type", "URI")
				103	idno.text = uris.text
				104	domain = ET.SubElement(titleStmt, "domain")
				105	domain.text = domains.get("type")
Rameela Yaddehige	2a20eac	2023-07-14 17:10:32 +0200	[diff] [blame]	106	'''
Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	107	splitFName = file.split("/")
				108	if (splitFName[-2] in ['Persuasive', 'Blog']):
				109	domain.text = splitFName[-2]
				110	elif(splitFName[-4] != "Originaldaten"):
				111	domain.text = splitFName[-4] + ':' + splitFName[-3] + ':' + splitFName[-2]
				112	else:
				113	domain.text = splitFName[-3] + ':' + splitFName[-2]
Rameela Yaddehige	2a20eac	2023-07-14 17:10:32 +0200	[diff] [blame]	114	'''
Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	115	domain.text = domain.text.replace("_hobbies", "_Hobbies")
				116	text = ET.SubElement(tei, "text")
Rameela Yaddehige	46ebd4a	2023-08-09 12:33:21 +0200	[diff] [blame^]	117	text.append(texts[i])
				118
Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	119
				120	return tree
				121
				122
# Script entry point. The call is unconditional (there is no __main__ guard),
# so the conversion also runs if this module is imported.
main()