Blame - bunc2tei.py - lora-sp/bunc2tei

blob: 9870285bf355151604c3b081ec79b042799fc586 [file] [log] [blame]

lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	1	import os
				2	import xml.etree.ElementTree as ET
				3
# Directory that holds the source corpus files (machine-specific location).
path = "/home/spassova/BGCorpusExamples"

# os.listdir() returns names in arbitrary order; sort them so that the file
# picked below is the same on every run.
files = sorted(os.listdir(path))
if not files:
    # Fail with a clear message instead of a bare IndexError on files[0].
    raise FileNotFoundError(f"no input files found in {path!r}")

# Only the first file of the directory is converted.
tree = ET.parse(os.path.join(path, files[0]))
root = tree.getroot()
				9
def _elements_of_type(type_name):
    """Return all descendants of ``root`` whose ``type`` attribute is *type_name*.

    The result is a list in document order, as produced by ``findall``.
    """
    return root.findall(f".//*[@type='{type_name}']")


# One list per metadata field, each gathered over the whole input document.
titles = _elements_of_type("title")
domains = _elements_of_type("domain")
pageURLs = _elements_of_type("pageURL")
ids = _elements_of_type("id")
mainImageURLs = _elements_of_type("mainImageURL")
mainImageTexts = _elements_of_type("mainImageTexts")
mainImageSources = _elements_of_type("mainImageSources")
authors = _elements_of_type("authors")
authorURLs = _elements_of_type("authorURLs")
categories = _elements_of_type("category")
subCategories = _elements_of_type("subCategory")
tags = _elements_of_type("tags")
datesPublished = _elements_of_type("datePublished")
timesPublished = _elements_of_type("timePublished")
datesModified = _elements_of_type("dateModified")
timesModified = _elements_of_type("timeModified")
mainImageWidths = _elements_of_type("mainImageWidth")
mainImageHeights = _elements_of_type("mainImageHeight")
mainImageThumbnailURLs = _elements_of_type("mainImageThumbnailURL")
				29	texts = []
				30
				31	number_of_texts = 0
				32	for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
				33	number_of_texts+=1
				34	for body in text:
				35	for div1 in body:
				36	for div2 in div1:
				37	for div3 in div2:
				38	if div3.get('type') == "metadata":
				39	div2.remove(div3)
				40
				41	texts.append(div2)
				42
				43
				44	#number_of_texts = len(root.findall("{http://www.tei-c.org/ns/1.0}text"))
				45
				46	for elem in root.findall("*"):
				47	root.remove(elem)
				48
				49	root.tag = "teiCorpus"
				50
				51	for i in range(number_of_texts):
				52	tei = ET.SubElement(root, "TEI")
				53	teiHeader = ET.SubElement(tei, "teiHeader")
				54	fileDesc = ET.SubElement(teiHeader, "fileDesc")
				55	titleStmt = ET.SubElement(fileDesc, "titleStmt")
				56	textSigle = ET.SubElement(titleStmt, "textSigle")
				57	textSigle.text = "BNC/TST." + f"{i:05}"
				58	sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
				59	analytic = ET.SubElement(sourceDesc, "analytic")
				60	htitle = ET.SubElement(analytic, "h.title")
				61	htitle.text = titles[i].text
				62	hauthor = ET.SubElement(analytic, "h.author")
				63	hauthor.text = authors[i].text
				64	imprint = ET.SubElement(sourceDesc, "imprint")
				65	pubDateYear = ET.SubElement(imprint, "pubDate")
				66	pubDateYear.set("type", "year")
				67	pubDateYear.text = datesPublished[i].text[0:4]
				68	pubDateMonth = ET.SubElement(imprint, "pubDate")
				69	pubDateMonth.set("type", "month")
				70	pubDateMonth.text = datesPublished[i].text[5:7]
				71	pubDateDay = ET.SubElement(imprint, "pubDate")
				72	pubDateDay.set("type", "day")
				73	pubDateDay.text = datesPublished[i].text[8:10]
				74	pubDateTime = ET.SubElement(imprint, "pubDate")
				75	pubDateTime.set("type", "time")
				76	pubDateTime.text = timesPublished[i].text
lora-sp	1ffc87a	2023-03-09 09:28:59 +0100	[diff] [blame]	77	pubPlace = ET.SubElement(imprint, "pubPlace")
				78	ref = ET.SubElement(pubPlace, "ref")
				79	ref.set("type", "page_url")
				80	ref.set("target", pageURLs[i].text)
lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	81
				82
lora-sp	8251163	2023-03-09 09:12:17 +0100	[diff] [blame]	83
				84
				85
# Map the TEI namespace to the empty prefix so that any element still in that
# namespace is written without an "ns0:" style prefix.
ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

# Pretty-print with one space per nesting level, then write the result.
ET.indent(tree, space=" ")
tree.write(
    "04_output.xml",
    encoding="utf-8",
    xml_declaration=True,
    method="xml",
    short_empty_elements=True,
)