Blame - bunc2tei.py - lora-sp/bunc2tei

blob: a8dbe9d8f84e1659d71982c2f0354bf7c3653a2e [file] [log] [blame]

lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	1	import os
				2	import xml.etree.ElementTree as ET
				3
# Path to documents
path = "/home/spassova/BGCorpusExamples"

# Namespace of the incoming TEI documents
TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"

# Values of the @type attribute under which the source documents keep their
# metadata. Only some of them are written to the output so far; the others
# are collected so that they are at hand when the header is extended.
METADATA_TYPES = (
    "title",
    "domain",
    "pageURL",
    "id",
    "mainImageURL",
    "mainImageTexts",
    "mainImageSources",
    "authors",
    "authorURLs",
    "category",
    "subCategory",
    "tags",
    "datePublished",
    "timePublished",
    "dateModified",
    "timeModified",
    "mainImageWidth",
    "mainImageHeight",
    "mainImageThumbnailURL",
)


def collect_metadata(root):
    """Return a dict mapping each metadata type to the elements carrying it.

    For every name in METADATA_TYPES the whole tree below *root* is searched
    for elements whose ``type`` attribute equals that name. The lists are in
    document order.
    """
    return {
        name: root.findall(f".//*[@type='{name}']") for name in METADATA_TYPES
    }


def extract_texts(root):
    """Strip metadata divisions and return the content divisions per text.

    Walks every TEI ``text`` element below *root* four levels down
    (text > child > child > child) and removes each fourth-level child whose
    ``type`` attribute is ``"metadata"`` from its parent.

    Returns a list with one entry per ``text`` element; each entry is the
    list of third-level elements of that text, now free of metadata.
    """
    texts = []
    for text in root.iter(f"{{{TEI_NAMESPACE}}}text"):
        divisions = []
        for body in text:
            for div1 in body:
                for div2 in div1:
                    # Iterate over a snapshot: removing a child while
                    # iterating the live element skips the next sibling.
                    for div3 in list(div2):
                        if div3.get("type") == "metadata":
                            div2.remove(div3)
                    divisions.append(div2)
        # Grouping per text keeps the result aligned with the text index
        # even when a text has no or several inner divisions.
        texts.append(divisions)
    return texts


def add_tei_document(root, index, metadata, divisions):
    """Append one ``TEI`` element (header plus text) for text *index* to *root*.

    *metadata* is the dict produced by collect_metadata(); entry *index* of
    the title, authors, datePublished, timePublished and pageURL lists is
    used. This assumes each source text contributes exactly one element per
    field, in order - that is not verified here.
    *divisions* are the content divisions whose children become the body.
    """
    tei = ET.SubElement(root, "TEI")

    # Header
    tei_header = ET.SubElement(tei, "teiHeader")
    file_desc = ET.SubElement(tei_header, "fileDesc")
    title_stmt = ET.SubElement(file_desc, "titleStmt")
    text_sigle = ET.SubElement(title_stmt, "textSigle")
    text_sigle.text = f"BNC/TST.{index:05}"

    source_desc = ET.SubElement(file_desc, "sourceDesc")
    analytic = ET.SubElement(source_desc, "analytic")
    ET.SubElement(analytic, "h.title").text = metadata["title"][index].text
    ET.SubElement(analytic, "h.author").text = metadata["authors"][index].text

    imprint = ET.SubElement(source_desc, "imprint")
    # The date is sliced by position: characters 0-3 are the year, 5-6 the
    # month and 8-9 the day (i.e. a YYYY-MM-DD style string is expected).
    date_published = metadata["datePublished"][index].text
    pub_dates = (
        ("year", date_published[0:4]),
        ("month", date_published[5:7]),
        ("day", date_published[8:10]),
        ("time", metadata["timePublished"][index].text),
    )
    for date_type, value in pub_dates:
        pub_date = ET.SubElement(imprint, "pubDate")
        pub_date.set("type", date_type)
        pub_date.text = value

    pub_place = ET.SubElement(imprint, "pubPlace")
    ref = ET.SubElement(pub_place, "ref")
    ref.set("type", "page_url")
    ref.set("target", metadata["pageURL"][index].text)

    # Text
    text = ET.SubElement(tei, "text")
    body = ET.SubElement(text, "body")
    for division in divisions:
        for p in division:
            body.append(p)
    return tei


def convert_to_i5(root):
    """Rebuild *root* in place as a ``teiCorpus`` with one ``TEI`` per text.

    Metadata is collected and the texts are extracted first; then all
    children of *root* are dropped, *root* is renamed and the new structure
    is created. Returns *root* for convenience.
    """
    # Store metadata and texts before the tree is emptied
    metadata = collect_metadata(root)
    texts = extract_texts(root)

    # Remove all elements from root (findall returns a list, so removing
    # while looping is safe here)
    for elem in root.findall("*"):
        root.remove(elem)

    # Rename root
    root.tag = "teiCorpus"

    # Create i5 structure
    for index, divisions in enumerate(texts):
        add_tei_document(root, index, metadata, divisions)
    return root


def main():
    """Convert the first listed document in ``path`` and write 04_output.xml."""
    # NOTE: os.listdir() returns entries in arbitrary order, so "first" is
    # whatever the file system reports first.
    files = os.listdir(path)

    # Parse tree and get root
    tree = ET.parse(path + "/" + files[0])
    convert_to_i5(tree.getroot())

    ET.indent(tree, " ")
    # Register the TEI namespace with an empty prefix so that the elements
    # taken over from the source are serialised without a namespace prefix.
    ET.register_namespace("", TEI_NAMESPACE)
    tree.write(
        "04_output.xml",
        encoding="utf-8",
        xml_declaration=True,
        method="xml",
        short_empty_elements=True,
    )


if __name__ == "__main__":
    main()