Blame - bunc2tei.py - lora-sp/bunc2tei

blob: d143470c4440f9d2f85b7eb229fc78ce00bb8fa0 [file] [log] [blame]

lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame^]	1	import os, sys
lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	2	import xml.etree.ElementTree as ET
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame^]	3	from xml.dom import minidom
				4
lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	5
def main():
    """Build ``output.p5.xml`` from the documents named on the command line.

    Steps:
      1. Write an empty ``<teiCorpus>`` skeleton to ``tree_structure.xml``
         in the current working directory.
      2. Call ``process(j, path)`` for every command-line path, where ``j``
         is the zero-based position of the path (the first path gets 0).
      3. Re-read ``tree_structure.xml``, indent it and write it to
         ``output.p5.xml`` as UTF-8 with an XML declaration.

    Because step 3 reads the skeleton file back from disk, only what has
    been stored in ``tree_structure.xml`` by then ends up in the output.

    Raises:
        SystemExit: if no input path was given on the command line.
    """
    # Validate the arguments first so that a usage error leaves no files behind.
    input_files = sys.argv[1:]
    if not input_files:
        sys.exit("usage: bunc2tei.py FILE [FILE ...]")

    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    origRoot = ET.fromstring(corpus)
    corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
    with open("tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(corpusStr)

    # Process all documents; j becomes the document part of each text sigle
    for j, path in enumerate(input_files):
        process(j, path)

    # Parse corpus tree, indent and output
    corpusTree = ET.parse("tree_structure.xml")
    ET.indent(corpusTree, " ")
    corpusTree.write(
        "output.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	23
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame^]	24
				25	def process(j, file):
				26	#j = 0
				27	# Parse corpus tree and get corpus root
				28	corpusTree = ET.parse("tree_structure.xml")
				29	corpusRoot = corpusTree.getroot()
				30
				31	# Parse document tree and get root
lora-sp	4201a5e	2023-03-09 16:19:57 +0100	[diff] [blame]	32	tree = ET.parse(file)
				33	root = tree.getroot()
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame^]	34
				35	# Store metadata and texts in lists
lora-sp	4201a5e	2023-03-09 16:19:57 +0100	[diff] [blame]	36	titles = root.findall(".//*[@type='title']")
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame^]	37	#domains = root.findall(".//*[@type='domain']")
lora-sp	4201a5e	2023-03-09 16:19:57 +0100	[diff] [blame]	38	pageURLs = root.findall(".//*[@type='pageURL']")
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame^]	39	#ids = root.findall(".//*[@type='id']")
				40	#mainImageURLs = root.findall(".//*[@type='mainImageURL']")
				41	#mainImageTexts = root.findall(".//*[@type='mainImageTexts']")
				42	#mainImageSources = root.findall(".//*[@type='mainImageSources']")
lora-sp	4201a5e	2023-03-09 16:19:57 +0100	[diff] [blame]	43	authors = root.findall(".//*[@type='authors']")
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame^]	44	#authorURLs = root.findall(".//*[@type='authorURLs']")
				45	#categories = root.findall(".//*[@type='category']")
				46	#subCategories = root.findall(".//*[@type='subCategory']")
				47	#tags = root.findall(".//*[@type='tags']")
lora-sp	4201a5e	2023-03-09 16:19:57 +0100	[diff] [blame]	48	datesPublished = root.findall(".//*[@type='datePublished']")
				49	timesPublished = root.findall(".//*[@type='timePublished']")
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame^]	50	#datesModified = root.findall(".//*[@type='dateModified']")
				51	#timesModified = root.findall(".//*[@type='timeModified']")
				52	#mainImageWidths = root.findall(".//*[@type='mainImageWidth']")
				53	#mainImageHeights = root.findall(".//*[@type='mainImageHeight']")
				54	#mainImageThumbnailURLs = root.findall(".//*[@type='mainImageThumbnailURL']")
lora-sp	4201a5e	2023-03-09 16:19:57 +0100	[diff] [blame]	55	texts = []
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame^]	56
				57	# Count text elements and remove metadata
lora-sp	4201a5e	2023-03-09 16:19:57 +0100	[diff] [blame]	58	number_of_texts = 0
				59	for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
				60	number_of_texts+=1
				61	for body in text:
				62	for div1 in body:
				63	for div2 in div1:
				64	for div3 in div2:
				65	if div3.get('type') == "metadata":
				66	div2.remove(div3)
lora-sp	8251163	2023-03-09 09:12:17 +0100	[diff] [blame]	67
lora-sp	4201a5e	2023-03-09 16:19:57 +0100	[diff] [blame]	68	texts.append(div2)
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame^]	69
				70	# Remove all elements from root
lora-sp	4201a5e	2023-03-09 16:19:57 +0100	[diff] [blame]	71	for elem in root.findall("*"):
				72	root.remove(elem)
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame^]	73
				74	# Rename root
lora-sp	4201a5e	2023-03-09 16:19:57 +0100	[diff] [blame]	75	root.tag = "teiDoc"
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame^]	76
				77	# Create target structure
lora-sp	4201a5e	2023-03-09 16:19:57 +0100	[diff] [blame]	78	for i in range(number_of_texts):
				79	tei = ET.SubElement(root, "TEI")
				80	teiHeader = ET.SubElement(tei, "teiHeader")
				81	fileDesc = ET.SubElement(teiHeader, "fileDesc")
				82	titleStmt = ET.SubElement(fileDesc, "titleStmt")
				83	textSigle = ET.SubElement(titleStmt, "textSigle")
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame^]	84	textSigle.text = "BNC/" + f"{j:03}" + "." + f"{i:05}"
lora-sp	4201a5e	2023-03-09 16:19:57 +0100	[diff] [blame]	85	sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
				86	analytic = ET.SubElement(sourceDesc, "analytic")
				87	htitle = ET.SubElement(analytic, "h.title")
				88	htitle.text = titles[i].text
				89	hauthor = ET.SubElement(analytic, "h.author")
				90	hauthor.text = authors[i].text
				91	imprint = ET.SubElement(sourceDesc, "imprint")
				92	pubDateYear = ET.SubElement(imprint, "pubDate")
				93	pubDateYear.set("type", "year")
				94	pubDateYear.text = datesPublished[i].text[0:4]
				95	pubDateMonth = ET.SubElement(imprint, "pubDate")
				96	pubDateMonth.set("type", "month")
				97	pubDateMonth.text = datesPublished[i].text[5:7]
				98	pubDateDay = ET.SubElement(imprint, "pubDate")
				99	pubDateDay.set("type", "day")
				100	pubDateDay.text = datesPublished[i].text[8:10]
				101	pubDateTime = ET.SubElement(imprint, "pubDate")
				102	pubDateTime.set("type", "time")
				103	pubDateTime.text = timesPublished[i].text
				104	pubPlace = ET.SubElement(imprint, "pubPlace")
				105	ref = ET.SubElement(pubPlace, "ref")
				106	ref.set("type", "page_url")
				107	ref.set("target", pageURLs[i].text)
				108	text = ET.SubElement(tei, "text")
				109	body = ET.SubElement(text, "body")
				110	for p in texts[i]:
				111	body.append(p)
lora-sp	8251163	2023-03-09 09:12:17 +0100	[diff] [blame]	112
lora-sp	132c3e5	2023-03-09 16:32:37 +0100	[diff] [blame]	113
				114	corpusRoot.append(root)
lora-sp	4201a5e	2023-03-09 16:19:57 +0100	[diff] [blame]	115	ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
lora-sp	132c3e5	2023-03-09 16:32:37 +0100	[diff] [blame]	116
lora-sp	132c3e5	2023-03-09 16:32:37 +0100	[diff] [blame]	117
# Script entry point: call main() only when this file is run directly, not when it is imported.
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame^]	118	if __name__ == "__main__":
				119	main()