Blame - bunc2tei.py - lora-sp/bunc2tei

blob: fccc2c1a56b65b9264810af9ccaa3acd32c8ff89 [file] [log] [blame]

"""Convert a BGCorpus TEI document into an I5-style ``teiCorpus`` file.

The first document (in sorted order) of the input directory is parsed, the
metadata of every ``text`` element is moved into a newly created
``teiHeader``, and the result is written to ``04_output.xml``.
"""

import os
import xml.etree.ElementTree as ET

# Path to documents
path = "/home/spassova/BGCorpusExamples"

# Namespace used by the input documents and declared as the default
# namespace of the output document.
TEI_NS = "http://www.tei-c.org/ns/1.0"


def text_of(elements, index):
    """Return the text of ``elements[index]``, or ``""`` if it has none.

    Raises:
        IndexError: if there is no element at ``index``.
    """
    return elements[index].text or ""


def collect_texts(root):
    """Strip metadata divs and collect the text containers.

    Walks ``text > child > child > child`` below ``root``; every third-level
    container has its direct children of ``type="metadata"`` removed and is
    then collected.

    Returns:
        A ``(number_of_texts, texts)`` tuple: the number of ``text``
        elements found and the list of stripped containers, both in
        document order.
    """
    number_of_texts = 0
    texts = []
    for text in root.iter(f"{{{TEI_NS}}}text"):
        number_of_texts += 1
        for body in text:
            for div1 in body:
                for div2 in div1:
                    # Iterate over a snapshot: removing children from the
                    # live element while looping over it skips the sibling
                    # that follows each removed one.
                    for div3 in list(div2):
                        if div3.get("type") == "metadata":
                            div2.remove(div3)
                    texts.append(div2)
    return number_of_texts, texts


def build_corpus(root):
    """Rewrite ``root`` in place into a ``teiCorpus`` with one ``TEI`` per text.

    Metadata is matched to texts by position: the i-th element of each
    metadata type found anywhere below ``root`` belongs to the i-th text.

    Args:
        root: root element of the parsed input document.

    Returns:
        The same ``root`` element, renamed and restructured.

    Raises:
        IndexError: if there are fewer metadata elements of a required type
            (or fewer collected text containers) than ``text`` elements.
    """
    # Store the metadata that ends up in the header. The lookups must happen
    # before the metadata divs are detached from the tree.
    titles = root.findall(".//*[@type='title']")
    authors = root.findall(".//*[@type='authors']")
    dates_published = root.findall(".//*[@type='datePublished']")
    times_published = root.findall(".//*[@type='timePublished']")
    page_urls = root.findall(".//*[@type='pageURL']")

    # Count text elements and remove metadata
    number_of_texts, texts = collect_texts(root)

    # Remove all elements from root (findall returns a list, so removing
    # while looping is safe here).
    for elem in root.findall("*"):
        root.remove(elem)

    # Rename root
    root.tag = "teiCorpus"

    # Create i5 structure
    for i in range(number_of_texts):
        tei = ET.SubElement(root, "TEI")
        tei_header = ET.SubElement(tei, "teiHeader")
        file_desc = ET.SubElement(tei_header, "fileDesc")
        title_stmt = ET.SubElement(file_desc, "titleStmt")
        text_sigle = ET.SubElement(title_stmt, "textSigle")
        text_sigle.text = "BNC/TST." + f"{i:05}"

        source_desc = ET.SubElement(file_desc, "sourceDesc")
        analytic = ET.SubElement(source_desc, "analytic")
        ET.SubElement(analytic, "h.title").text = text_of(titles, i)
        ET.SubElement(analytic, "h.author").text = text_of(authors, i)

        imprint = ET.SubElement(source_desc, "imprint")
        # The date is sliced at fixed offsets: characters 0-3 are the year,
        # 5-6 the month and 8-9 the day (e.g. "2023-03-08").
        date = text_of(dates_published, i)
        pub_dates = (
            ("year", date[0:4]),
            ("month", date[5:7]),
            ("day", date[8:10]),
            ("time", text_of(times_published, i)),
        )
        for kind, value in pub_dates:
            pub_date = ET.SubElement(imprint, "pubDate")
            pub_date.set("type", kind)
            pub_date.text = value

        pub_place = ET.SubElement(imprint, "pubPlace")
        ref = ET.SubElement(pub_place, "ref")
        ref.set("type", "page_url")
        # An attribute value of None cannot be serialized, hence text_of().
        ref.set("target", text_of(page_urls, i))

        text = ET.SubElement(tei, "text")
        body = ET.SubElement(text, "body")
        for p in texts[i]:
            body.append(p)

    return root


def main(input_dir=path, output_file="04_output.xml"):
    """Convert the first document in ``input_dir`` and write ``output_file``.

    Raises:
        FileNotFoundError: if ``input_dir`` does not exist or is empty.
    """
    # os.listdir returns entries in arbitrary order; sort so that the same
    # document is picked on every run.
    files = sorted(os.listdir(input_dir))
    if not files:
        raise FileNotFoundError(f"no input documents found in {input_dir}")

    # Parse tree and get root
    tree = ET.parse(os.path.join(input_dir, files[0]))
    build_corpus(tree.getroot())

    ET.indent(tree, " ")
    # Serialize TEI-namespaced elements without a prefix.
    ET.register_namespace("", TEI_NS)
    tree.write(
        output_file,
        encoding="utf-8",
        xml_declaration=True,
        method="xml",
        short_empty_elements=True,
    )


if __name__ == "__main__":
    main()