Blame - bunc2tei.py - lora-sp/bunc2tei

blob: 055f45c16dbfe7c1f6e4b24c32d57bdfe6853247 [file] [log] [blame]

lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame]	1	import os, sys
lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	2	import xml.etree.ElementTree as ET
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame]	3	from xml.dom import minidom
lora-sp	09a58a0	2023-03-10 16:33:46 +0100	[diff] [blame]	4	from lxml import etree
				5	from io import StringIO
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame]	6
lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	7
def main():
    """Build one TEI corpus file from the documents named on the command line.

    An empty ``<teiCorpus>`` skeleton is written to
    ``input/tree_structure.xml`` and parsed back, the converted root of every
    file in ``sys.argv[1:]`` is appended to it, and the indented result is
    saved as ``output/corpus.p5.xml``.

    A file whose conversion raises is skipped: its name is printed to stdout
    and the reason to stderr.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    origRoot = ET.fromstring(corpus)
    corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("input", exist_ok=True)
    os.makedirs("output", exist_ok=True)
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(corpusStr)

    # Parse corpus tree
    corpusTree = ET.parse("input/tree_structure.xml")
    corpusRoot = corpusTree.getroot()

    # Process documents and append to corpus tree
    for j, path in enumerate(sys.argv[1:]):
        try:
            currentRoot = convert(j, path).getroot()
        except Exception as err:
            # Only ordinary errors are treated as "skip this file";
            # KeyboardInterrupt / SystemExit still stop the run.
            print(path)
            print(f"  skipped: {err!r}", file=sys.stderr)
            continue
        corpusRoot.append(currentRoot)

    # Indent and save tree
    ET.indent(corpusTree, " ")
    corpusTree.write(
        "output/corpus.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	38
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame]	39
def convert(j, file):
    """Convert one source document into a ``teiDoc`` tree.

    The document is parsed, the metadata values needed for the header are
    collected, every ``type="metadata"`` block is stripped from the text
    containers, and the root is emptied, renamed to ``teiDoc`` and refilled
    with one ``TEI`` element (header + body) per ``text`` element found.

    Args:
        j: Document number; formatted as the three-digit part of each text
            sigle (``BNC/jjj.iiiii``).
        file: Source accepted by ``ET.parse`` (path or file object).

    Returns:
        The parsed ``ElementTree`` with its root rewritten as described.

    Raises:
        IndexError: If the document holds fewer title, author, date, time,
            page-URL or text-container elements than ``text`` elements.
        TypeError: If a ``datePublished`` element has no text.
    """
    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    # Store metadata in lists; entry i is used for the i-th text.
    titles = root.findall(".//*[@type='title']")
    pageURLs = root.findall(".//*[@type='pageURL']")
    authors = root.findall(".//*[@type='authors']")
    datesPublished = root.findall(".//*[@type='datePublished']")
    timesPublished = root.findall(".//*[@type='timePublished']")
    texts = []

    # Count text elements and remove metadata
    number_of_texts = 0
    for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
        number_of_texts += 1
        for body in text:
            for div1 in body:
                for div2 in div1:
                    # Iterate over a snapshot: removing children from div2
                    # while iterating div2 itself skips the sibling that
                    # follows each removed element.
                    for div3 in list(div2):
                        if div3.get('type') == "metadata":
                            div2.remove(div3)

                    texts.append(div2)

    # Remove all elements from root
    for elem in root.findall("*"):
        root.remove(elem)

    # Rename root
    root.tag = "teiDoc"

    # Create target structure
    for i in range(number_of_texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")
        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = f"BNC/{j:03}.{i:05}"

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        htitle = ET.SubElement(analytic, "h.title")
        htitle.text = titles[i].text
        hauthor = ET.SubElement(analytic, "h.author")
        hauthor.text = authors[i].text

        imprint = ET.SubElement(sourceDesc, "imprint")
        # The date text is sliced at fixed positions 0-4, 5-7 and 8-10.
        date = datesPublished[i].text
        date_parts = (
            ("year", date[0:4]),
            ("month", date[5:7]),
            ("day", date[8:10]),
            ("time", timesPublished[i].text),
        )
        for kind, value in date_parts:
            pubDate = ET.SubElement(imprint, "pubDate")
            pubDate.set("type", kind)
            pubDate.text = value

        pubPlace = ET.SubElement(imprint, "pubPlace")
        ref = ET.SubElement(pubPlace, "ref")
        ref.set("type", "page_url")
        # An attribute value of None cannot be serialized and would make the
        # later write of the tree fail, so an empty element yields "".
        ref.set("target", pageURLs[i].text or "")

        text = ET.SubElement(tei, "text")
        body = ET.SubElement(text, "body")
        for p in texts[i]:
            body.append(p)

    return tree
lora-sp	132c3e5	2023-03-09 16:32:37 +0100	[diff] [blame]	125
lora-sp	132c3e5	2023-03-09 16:32:37 +0100	[diff] [blame]	126
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()