Blame - irish2tei.py - ICC/irish2korap

blob: cd468a648238bd9b2b068d0686a7e2436a52f3e2 [file] [log] [blame]

Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	1	import os, sys
				2	import xml.etree.ElementTree as ET
				3	from xml.dom import minidom
				4	import traceback
				5	import sys
				6
				7	def main():
				8	# Create corpus structure from string and save into file
				9	corpus = "<teiCorpus>\n</teiCorpus>"
				10	origRoot = ET.fromstring(corpus)
				11	corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
				12	if not os.path.exists('input'):
				13	os.makedirs("input")
				14	if not os.path.exists('output'):
				15	os.makedirs("output")
				16	with open("input/tree_structure.xml", "w") as f:
				17	f.write(corpusStr)
				18
				19	# Parse corpus tree
				20	corpusTree = ET.parse("input/tree_structure.xml")
				21	corpusRoot = corpusTree.getroot()
				22
				23	# Process documents and append to corpus tree
				24	for j in range(1, len(sys.argv)):
				25	try:
				26	currentTree = convert(j-1, sys.argv[j])
				27	currentRoot = currentTree.getroot()
				28	corpusRoot.append(currentRoot)
				29	except:
				30	print("ERROR:"+sys.argv[j])
				31	print(traceback.format_exc())
				32	print(sys.exc_info()[2])
				33	continue
				34
				35	# Indent and save tree
				36	ET.indent(corpusTree, " ")
Rameela Yaddehige	6c0ff87	2023-07-14 14:46:16 +0200	[diff] [blame]	37	corpusTree.write("output/GA_corpus.p5.xml", encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)
Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	38
				39
				40	def convert(j, file):
				41	# Parse document tree and get root
				42	tree = ET.parse(file)
				43	root = tree.getroot()
				44	ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
				45
				46	# Store metadata and texts in lists
				47	titles = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}title")
				48	uris = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}pubPlace")
Rameela Yaddehige	46ebd4a	2023-08-09 12:33:21 +0200	[diff] [blame]	49	authors = root.findall(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}author")
Rameela Yaddehige	2f996c2	2023-08-09 15:04:27 +0200	[diff] [blame^]	50	txtPublisher = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}publisher")
				51	licenses = root.findall(".//*{http://www.tei-c.org/ns/1.0}availability/{http://www.tei-c.org/ns/1.0}licence")
Rameela Yaddehige	46ebd4a	2023-08-09 12:33:21 +0200	[diff] [blame]	52
Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	53	dates = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}date")
				54	texts = root.findall(".//*{http://www.tei-c.org/ns/1.0}body/{http://www.tei-c.org/ns/1.0}div")
				55	domains = root.find(".//*[@type]")
				56
				57	# Count text elements
				58	number_of_texts = 0
				59	for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
				60	for body in text:
				61	for div in body:
				62	number_of_texts+=1
				63
				64	number_of_texts = len(texts)
				65	# Remove all elements from root
				66	for elem in root.findall("*"):
				67	root.remove(elem)
				68
				69	# Rename root
				70	root.tag = "teiDoc"
				71
				72	# Create target structure
				73	for i in range(number_of_texts):
				74	tei = ET.SubElement(root, "TEI")
				75	teiHeader = ET.SubElement(tei, "teiHeader")
				76	fileDesc = ET.SubElement(teiHeader, "fileDesc")
				77	titleStmt = ET.SubElement(fileDesc, "titleStmt")
				78	textSigle = ET.SubElement(titleStmt, "textSigle")
				79	textSigle.text = "GA/" + f"{j:03}" + "." + f"{i:05}"
				80	sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
				81	analytic = ET.SubElement(sourceDesc, "analytic")
				82	title = ET.SubElement(titleStmt, "title")
				83	title.text = titles.text
Rameela Yaddehige	46ebd4a	2023-08-09 12:33:21 +0200	[diff] [blame]	84
Rameela Yaddehige	2f996c2	2023-08-09 15:04:27 +0200	[diff] [blame^]	85	print(textSigle.text, end='\n')
Rameela Yaddehige	46ebd4a	2023-08-09 12:33:21 +0200	[diff] [blame]	86
Rameela Yaddehige	17a1191	2023-08-09 12:44:38 +0200	[diff] [blame]	87	#adding changes to detect multiple authors
Rameela Yaddehige	46ebd4a	2023-08-09 12:33:21 +0200	[diff] [blame]	88	for author in authors :
				89	hauthor = ET.SubElement(analytic, "h.author")
				90	hauthor.text = author.text
Rameela Yaddehige	17a1191	2023-08-09 12:44:38 +0200	[diff] [blame]	91	#print(hauthor.text, end='\n')
Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	92	imprint = ET.SubElement(sourceDesc, "imprint")
Rameela Yaddehige	2f996c2	2023-08-09 15:04:27 +0200	[diff] [blame^]	93
				94	#adding publisher
				95
				96	print('Publisher : ',txtPublisher.text, end='\n')
				97	if txtPublisher is None:
				98	print('Publisher : ',txtPublisher.text, end='\n')
				99	else :
				100	publisher = ET.SubElement(imprint, "publisher")
				101	publisher.text = txtPublisher.text
				102
				103	#adding licenses
				104	for license in licenses:
				105	print('License : ',license.text,end='\n')
				106
Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	107	pubDateYear = ET.SubElement(imprint, "pubDate")
				108	pubDateYear.set("type", "year")
				109	pubDateYear.text = dates.text
				110	pubDateMonth = ET.SubElement(imprint, "pubDate")
				111	pubDateMonth.set("type", "month")
				112	#pubDateMonth.text = datesPublished[i].text
				113	pubDateDay = ET.SubElement(imprint, "pubDate")
				114	pubDateDay.set("type", "day")
				115	#pubDateDay.text = datesPublished[i].text
				116	pubPlace = ET.SubElement(imprint, "pubPlace")
				117	idno = ET.SubElement(pubPlace, "idno")
				118	idno.set("type", "URI")
				119	idno.text = uris.text
				120	domain = ET.SubElement(titleStmt, "domain")
				121	domain.text = domains.get("type")
Rameela Yaddehige	2a20eac	2023-07-14 17:10:32 +0200	[diff] [blame]	122	'''
Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	123	splitFName = file.split("/")
				124	if (splitFName[-2] in ['Persuasive', 'Blog']):
				125	domain.text = splitFName[-2]
				126	elif(splitFName[-4] != "Originaldaten"):
				127	domain.text = splitFName[-4] + ':' + splitFName[-3] + ':' + splitFName[-2]
				128	else:
				129	domain.text = splitFName[-3] + ':' + splitFName[-2]
Rameela Yaddehige	2a20eac	2023-07-14 17:10:32 +0200	[diff] [blame]	130	'''
Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	131	domain.text = domain.text.replace("_hobbies", "_Hobbies")
				132	text = ET.SubElement(tei, "text")
Rameela Yaddehige	46ebd4a	2023-08-09 12:33:21 +0200	[diff] [blame]	133	text.append(texts[i])
				134
Rameela Yaddehige	270f7cc	2023-07-14 14:12:08 +0200	[diff] [blame]	135
				136	return tree
				137
				138
				139	main()