Blame - bunc2tei.py - lora-sp/bunc2tei

blob: 5ecff9b9229c09e6e374ec193233aa248b83b511 [file] [log] [blame]

lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	1	import os
				2	import xml.etree.ElementTree as ET
				3
_TEI_NS = "http://www.tei-c.org/ns/1.0"


def main():
    """Run ``processing`` on every entry of the hard-coded corpus directory.

    Each entry is handed to ``processing`` together with its position in the
    sorted directory listing. That position becomes the two-digit document
    number used in the text sigles and in the output file name.
    """
    corpus = "<teiCorpus></teiCorpus>"
    root = ET.fromstring(corpus)
    path = "/home/spassova/BGCorpusExamples/"
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # file -> number assignment stable between runs.
    files = sorted(os.listdir(path))
    for j, name in enumerate(files):
        processing(os.path.join(path, name), j)
        # NOTE(review): the corpus root only collects empty <teiDoc>
        # placeholders and is never serialized in this function.
        ET.SubElement(root, "teiDoc")


def _collect_texts(root):
    """Detach metadata divisions and return ``(number_of_texts, texts)``.

    For every TEI ``text`` element below *root* the function walks four
    levels down (the levels the original code called body, div1, div2, div3),
    removes each fourth-level element whose ``type`` attribute is
    ``"metadata"`` and records the third-level element (div2) in ``texts``.
    """
    number_of_texts = 0
    texts = []
    for text in root.iter(f"{{{_TEI_NS}}}text"):
        number_of_texts += 1
        for body in text:
            for div1 in body:
                for div2 in div1:
                    # Iterate over a snapshot: removing children from div2
                    # while iterating div2 itself skips the sibling that
                    # follows each removed element.
                    for div3 in list(div2):
                        if div3.get("type") == "metadata":
                            div2.remove(div3)
                    texts.append(div2)
    return number_of_texts, texts


def _append_tei(root, sigle, title, author, date, time, page_url, content):
    """Append one ``TEI`` element (header plus text body) to *root*."""
    tei = ET.SubElement(root, "TEI")
    teiHeader = ET.SubElement(tei, "teiHeader")
    fileDesc = ET.SubElement(teiHeader, "fileDesc")
    titleStmt = ET.SubElement(fileDesc, "titleStmt")
    ET.SubElement(titleStmt, "textSigle").text = sigle

    sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
    analytic = ET.SubElement(sourceDesc, "analytic")
    ET.SubElement(analytic, "h.title").text = title
    ET.SubElement(analytic, "h.author").text = author

    imprint = ET.SubElement(sourceDesc, "imprint")
    # The date string is cut at fixed offsets: [0:4] year, [5:7] month,
    # [8:10] day.
    parts = (
        ("year", date[0:4]),
        ("month", date[5:7]),
        ("day", date[8:10]),
        ("time", time),
    )
    for kind, value in parts:
        ET.SubElement(imprint, "pubDate", type=kind).text = value
    pubPlace = ET.SubElement(imprint, "pubPlace")
    ET.SubElement(pubPlace, "ref", type="page_url", target=page_url)

    body = ET.SubElement(ET.SubElement(tei, "text"), "body")
    body.extend(content)


def processing(file, j=0):
    """Rewrite one input document as a ``teiDoc`` tree and save it.

    The document is parsed, its metadata divisions are detached, the root is
    emptied and renamed to ``teiDoc``, and one ``TEI`` element (header built
    from the collected metadata, body filled with the remaining content) is
    appended per TEI ``text`` element found. The result is written to
    ``"<j as two digits>_output.xml"`` in the current working directory.

    Args:
        file: Path or file object accepted by ``ET.parse``.
        j: Document number used in the text sigles (``BNC/jj.iiiii``) and in
            the output file name. Defaults to 0.

    Raises:
        IndexError: If the document holds fewer title, author, date, time or
            page-URL entries (or fewer collected divisions) than ``text``
            elements.
    """
    # Parse tree and get root
    tree = ET.parse(file)
    root = tree.getroot()

    # Store metadata in lists. They are gathered document-wide and later
    # matched to texts purely by position.
    # NOTE(review): assumes exactly one entry of each kind per text, in
    # document order - confirm against the input format.
    titles = root.findall(".//*[@type='title']")
    authors = root.findall(".//*[@type='authors']")
    datesPublished = root.findall(".//*[@type='datePublished']")
    timesPublished = root.findall(".//*[@type='timePublished']")
    pageURLs = root.findall(".//*[@type='pageURL']")

    # Count text elements and remove metadata
    number_of_texts, texts = _collect_texts(root)

    # Remove all elements from root (attributes are kept), then rename it
    for elem in list(root):
        root.remove(elem)
    root.tag = "teiDoc"

    # Create i5 structure
    for i in range(number_of_texts):
        _append_tei(
            root,
            f"BNC/{j:02}.{i:05}",
            titles[i].text,
            authors[i].text,
            datesPublished[i].text,
            timesPublished[i].text,
            pageURLs[i].text,
            texts[i],
        )

    ET.indent(tree, " ")
    # Serialize TEI-namespaced elements without a prefix.
    ET.register_namespace("", _TEI_NS)
    tree.write(
        f"{j:02}_output.xml",
        encoding="utf-8",
        xml_declaration=True,
        method="xml",
        short_empty_elements=True,
    )