import os
import xml.etree.ElementTree as ET

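# This script merges TEI-encoded articles from a directory into a single
# I5-style corpus document: per-article metadata is collected, the metadata
# divs are removed from the running text, and one <TEI> entry is built per text.
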
def main():
    # Build an empty corpus root that collects the converted documents.
    corpus = "<teiCorpus></teiCorpus>"
    corpusRoot = ET.fromstring(corpus)
    corpusTree = ET.ElementTree(corpusRoot)
    path = "/home/spassova/BGCorpusExamples/"
    files = os.listdir(path)
    for j in range(len(files)):
        processing(path + files[j], corpusRoot, j)

    # ET.indent is available from Python 3.9; the output name reuses the last file index.
    ET.indent(corpusTree, " ")
    corpusTree.write(f"{j:02}_output.xml", encoding="utf-8", xml_declaration=True, method="xml", short_empty_elements=True)


def processing(file, corpusRoot, j):
    # Parse tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    # Store metadata and texts in lists
    titles = root.findall(".//*[@type='title']")
    domains = root.findall(".//*[@type='domain']")
    pageURLs = root.findall(".//*[@type='pageURL']")
    ids = root.findall(".//*[@type='id']")
    mainImageURLs = root.findall(".//*[@type='mainImageURL']")
    mainImageTexts = root.findall(".//*[@type='mainImageTexts']")
    mainImageSources = root.findall(".//*[@type='mainImageSources']")
    authors = root.findall(".//*[@type='authors']")
    authorURLs = root.findall(".//*[@type='authorURLs']")
    categories = root.findall(".//*[@type='category']")
    subCategories = root.findall(".//*[@type='subCategory']")
    tags = root.findall(".//*[@type='tags']")
    datesPublished = root.findall(".//*[@type='datePublished']")
    timesPublished = root.findall(".//*[@type='timePublished']")
    datesModified = root.findall(".//*[@type='dateModified']")
    timesModified = root.findall(".//*[@type='timeModified']")
    mainImageWidths = root.findall(".//*[@type='mainImageWidth']")
    mainImageHeights = root.findall(".//*[@type='mainImageHeight']")
    mainImageThumbnailURLs = root.findall(".//*[@type='mainImageThumbnailURL']")
    texts = []
    # Count text elements and remove metadata
    number_of_texts = 0
    for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
        number_of_texts += 1
        for body in text:
            for div1 in body:
                for div2 in div1:
                    # Iterate over a copy: removing children while iterating
                    # over the element itself would skip siblings.
                    for div3 in list(div2):
                        if div3.get('type') == "metadata":
                            div2.remove(div3)

                    texts.append(div2)
    # Remove all elements from root
    for elem in root.findall("*"):
        root.remove(elem)
    # Rename root
    root.tag = "teiDoc"
    # Create i5 structure
    for i in range(number_of_texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")
        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = "BNC/" + f"{j:02}" + "." + f"{i:05}"
        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        htitle = ET.SubElement(analytic, "h.title")
        htitle.text = titles[i].text
        hauthor = ET.SubElement(analytic, "h.author")
        hauthor.text = authors[i].text
        imprint = ET.SubElement(sourceDesc, "imprint")
        # datePublished is sliced as YYYY-MM-DD into separate pubDate elements.
        pubDateYear = ET.SubElement(imprint, "pubDate")
        pubDateYear.set("type", "year")
        pubDateYear.text = datesPublished[i].text[0:4]
        pubDateMonth = ET.SubElement(imprint, "pubDate")
        pubDateMonth.set("type", "month")
        pubDateMonth.text = datesPublished[i].text[5:7]
        pubDateDay = ET.SubElement(imprint, "pubDate")
        pubDateDay.set("type", "day")
        pubDateDay.text = datesPublished[i].text[8:10]
        pubDateTime = ET.SubElement(imprint, "pubDate")
        pubDateTime.set("type", "time")
        pubDateTime.text = timesPublished[i].text
        pubPlace = ET.SubElement(imprint, "pubPlace")
        ref = ET.SubElement(pubPlace, "ref")
        ref.set("type", "page_url")
        ref.set("target", pageURLs[i].text)
        # Move the cleaned text body under the new TEI element.
        text = ET.SubElement(tei, "text")
        body = ET.SubElement(text, "body")
        for p in texts[i]:
            body.append(p)

    # Attach the converted document to the shared corpus root.
    corpusRoot.append(root)
    # register_namespace is a global setting; it keeps the TEI namespace as the
    # default namespace when the corpus is serialised.
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    return

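# Run the conversion when the module is executed as a script.
if __name__ == "__main__":
    main()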