Blame - bunc2tei.py - lora-sp/bunc2tei

blob: 44d1cecd34dbfa3a96252545f4cf2048d9eb5088 [file] [log] [blame]

lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame]	1	import os, sys
lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	2	import xml.etree.ElementTree as ET
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame]	3	from xml.dom import minidom
lora-sp	09a58a0	2023-03-10 16:33:46 +0100	[diff] [blame]	4	from lxml import etree
				5	from io import StringIO
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame]	6
lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	7
def main():
    """Build a ``teiCorpus`` from the files named on the command line.

    Every command-line argument is treated as the path of one source
    document. Each document is passed to ``convert`` and the resulting root
    element is appended to the corpus root. The finished corpus is indented
    and written to ``/output/corpus.p5.xml``.

    A document that cannot be converted is reported on stderr and skipped,
    so one bad input does not abort the whole run.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    skeleton_root = ET.fromstring(corpus)
    skeleton_xml = minidom.parseString(ET.tostring(skeleton_root)).toprettyxml(indent=" ")
    with open("/input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(skeleton_xml)

    # Parse corpus tree
    corpus_tree = ET.parse("/input/tree_structure.xml")
    corpus_root = corpus_tree.getroot()

    # Process documents and append to corpus tree
    for index, path in enumerate(sys.argv[1:]):
        try:
            document_root = convert(index, path).getroot()
        except Exception as exc:
            # Best effort: skip this document, but say which one failed and
            # why. Unlike a bare ``except``, this lets Ctrl-C and
            # ``sys.exit`` propagate.
            print(f"sorry: could not convert {path!r}: {exc!r}", file=sys.stderr)
            continue
        corpus_root.append(document_root)

    # Indent and save tree
    ET.indent(corpus_tree, " ")
    corpus_tree.write("/output/corpus.p5.xml", encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)
lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	34
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame]	35
def convert(j, file):
    """Convert one source document into a ``teiDoc`` tree.

    The file is parsed, the metadata values used for the header are
    collected, and every ``div`` of type ``metadata`` is detached from the
    text containers. The root is then emptied, renamed to ``teiDoc`` and
    refilled with one ``TEI`` element (header plus text body) per ``text``
    element found in the source.

    Args:
        j: Zero-based document number; used as the three-digit part of each
            ``textSigle`` (``BNC/<j>.<i>``).
        file: Path or file object accepted by ``ET.parse``.

    Returns:
        The parsed ``ElementTree`` whose root has been rewritten in place.

    Raises:
        IndexError: If fewer metadata values or text containers than
            ``text`` elements are found.
        TypeError: If a ``datePublished`` element has no text.
    """
    tei_ns = "http://www.tei-c.org/ns/1.0"

    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", tei_ns)

    # Store metadata in lists, in document order. Only the fields used in
    # the header are collected; the earlier draft also listed commented-out
    # lookups for further ``type`` values (domain, id, mainImage*,
    # authorURLs, category, subCategory, tags, dateModified, timeModified),
    # which are not converted.
    titles = root.findall(".//*[@type='title']")
    pageURLs = root.findall(".//*[@type='pageURL']")
    authors = root.findall(".//*[@type='authors']")
    datesPublished = root.findall(".//*[@type='datePublished']")
    timesPublished = root.findall(".//*[@type='timePublished']")
    texts = []

    # Count text elements and remove metadata
    number_of_texts = 0
    for text in root.iter(f"{{{tei_ns}}}text"):
        number_of_texts += 1
        for body in text:
            for div1 in body:
                for div2 in div1:
                    # Iterate over a snapshot: removing children from div2
                    # while iterating div2 itself skips the sibling that
                    # follows each removed element.
                    for div3 in list(div2):
                        if div3.get('type') == "metadata":
                            div2.remove(div3)

                    # NOTE(review): the header loop below indexes ``texts``
                    # by text number, which presumes one such container per
                    # ``text`` element - confirm against the input format.
                    texts.append(div2)

    # Remove all elements from root
    for elem in root.findall("*"):
        root.remove(elem)

    # Rename root
    root.tag = "teiDoc"

    # Create target structure
    for i in range(number_of_texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")
        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = f"BNC/{j:03}.{i:05}"

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        htitle = ET.SubElement(analytic, "h.title")
        htitle.text = titles[i].text
        hauthor = ET.SubElement(analytic, "h.author")
        hauthor.text = authors[i].text

        imprint = ET.SubElement(sourceDesc, "imprint")
        # The slices below take year, month and day from fixed character
        # positions (0-4, 5-7, 8-10) of the datePublished text.
        date_published = datesPublished[i].text
        _add_pub_date(imprint, "year", date_published[0:4])
        _add_pub_date(imprint, "month", date_published[5:7])
        _add_pub_date(imprint, "day", date_published[8:10])
        _add_pub_date(imprint, "time", timesPublished[i].text)

        pubPlace = ET.SubElement(imprint, "pubPlace")
        ref = ET.SubElement(pubPlace, "ref")
        ref.set("type", "page_url")
        # An attribute value of None cannot be serialised and would only
        # fail later, when the tree is written; fall back to "".
        ref.set("target", pageURLs[i].text or "")

        new_text = ET.SubElement(tei, "text")
        new_body = ET.SubElement(new_text, "body")
        for p in texts[i]:
            new_body.append(p)

    return tree


def _add_pub_date(imprint, kind, value):
    """Append a ``pubDate`` child with ``type=kind`` and text *value* to *imprint*."""
    pub_date = ET.SubElement(imprint, "pubDate")
    pub_date.set("type", kind)
    pub_date.text = value
lora-sp	132c3e5	2023-03-09 16:32:37 +0100	[diff] [blame]	121
lora-sp	132c3e5	2023-03-09 16:32:37 +0100	[diff] [blame]	122
# Run the conversion only when executed as a script, so importing this module
# (e.g. to reuse convert) does not touch /input or /output.
if __name__ == "__main__":
    main()