Blame - bunc2tei.py - lora-sp/bunc2tei

blob: c1425673aa4c45d22be8df755feca6ffd675cd1d [file] [log] [blame]

lora-sp	95b8f92	2023-04-06 11:28:19 +0200	[diff] [blame^]	1	import sys
lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	2	import xml.etree.ElementTree as ET
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame]	3
lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	4
def main():
    """Convert the XML files named on the command line into one TEI corpus.

    Every argument after the script name is treated as the path of an input
    file.  Each file is converted to a ``teiDoc`` element; all of them are
    collected under a single ``teiCorpus`` root, which is indented and
    written to standard output.  A file that cannot be converted is reported
    on standard error and skipped.
    """
    corpusRoot = ET.Element("teiCorpus")

    # The file number is the zero-based position of the file on the command
    # line, so a skipped file leaves a gap in the numbering.
    for filenumber, path in enumerate(sys.argv[1:]):
        try:
            doc_data = extract_data(path)
            doc_tree = create_tree(doc_data, filenumber)
            corpusRoot.append(doc_tree.getroot())
        except Exception:
            # Deliberately broad: one bad input must not abort the whole run.
            # Unlike a bare ``except``, this still lets KeyboardInterrupt and
            # SystemExit propagate.
            print("Warning: could not parse file: " + path, file=sys.stderr)
            continue

    corpusTree = ET.ElementTree(corpusRoot)
    ET.indent(corpusTree, " ")
    corpusTree.write(sys.stdout, encoding='unicode')
lora-sp	6697864	2023-03-08 11:02:52 +0100	[diff] [blame]	21
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame]	22
def extract_data(file):
    """Parse an XML file and collect metadata and paragraphs of each text.

    ``file`` is passed straight to ``ET.parse`` (a path or a file object).
    Every element named ``text`` in the TEI namespace contributes one entry
    to the returned dictionary, keyed by its zero-based position in document
    order:

        data = {textnumber: {'title': ..., 'url': ..., 'author': ...,
                             'date': ..., 'time': ..., 'text': [...]}}

    'title', 'url' and 'author' are the attribute values of the ``text``
    element, or None when the attribute is absent.  The ``date`` attribute
    is split on single spaces: the first field becomes 'date', the second
    'time'; a missing attribute or missing time part yields an empty string
    rather than an exception.  'text' is the list of ``p`` elements found
    directly under ``body/div/div``, or an empty list when that container
    does not exist.

    Raises whatever ``ET.parse`` raises for an unreadable or malformed file.
    '''
    """
    tei = "{http://www.tei-c.org/ns/1.0}"

    tree = ET.parse(file)
    root = tree.getroot()
    # Module-wide ElementTree setting: serialize the TEI namespace as the
    # default namespace (no prefix) when trees are written out later.
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    data = {}

    for i, text in enumerate(root.iter(tei + "text")):
        # A missing attribute splits to [''], giving an empty date and time
        # instead of failing the whole file on one incomplete text.
        stamp = (text.get('date') or '').split(' ')

        # Paragraphs live two div levels below the body.
        container = text.find(f"./{tei}body/{tei}div/{tei}div")
        if container is None:
            paragraphs = []
        else:
            paragraphs = container.findall(f"./{tei}p")

        data[i] = {
            'title': text.get('title'),
            'url': text.get('url'),
            'author': text.get('author'),
            'date': stamp[0],
            'time': stamp[1] if len(stamp) > 1 else '',
            'text': paragraphs,
        }

    return data
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame]	51
lora-sp	ab4e0ea	2023-03-10 12:02:24 +0100	[diff] [blame]	52
def create_tree(data, filenumber):
    """Build the ``teiDoc`` tree for one input file.

    ``data`` maps the indices 0..len(data)-1 to dictionaries with the keys
    'title', 'url', 'author', 'date', 'time' and 'text' (a list of paragraph
    elements).  Each entry becomes one ``TEI`` element holding a
    ``teiHeader`` with the metadata and a ``text/body`` holding the
    paragraphs.  Text sigles have the form BNC/filenumber.textnumber,
    zero-padded to three and five digits, e.g. BNC/000.00000.

    Year, month and day are taken from character positions 0-4, 5-7 and
    8-10 of 'date'.  When 'url' is None the ``ref`` element is written
    without a ``target`` attribute, because ElementTree cannot serialize a
    None attribute value.

    Returns the indented ``ElementTree`` rooted at ``teiDoc``.
    """
    docRoot = ET.Element("teiDoc")

    for i in range(len(data)):
        entry = data[i]
        date = entry['date']

        tei = ET.SubElement(docRoot, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")

        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = f"BNC/{filenumber:03}.{i:05}"

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        ET.SubElement(analytic, "h.title").text = entry['title']
        ET.SubElement(analytic, "h.author").text = entry['author']

        imprint = ET.SubElement(sourceDesc, "imprint")
        # One pubDate element per component, distinguished by its type.
        components = (
            ("year", date[0:4]),
            ("month", date[5:7]),
            ("day", date[8:10]),
            ("time", entry['time']),
        )
        for kind, value in components:
            pubDate = ET.SubElement(imprint, "pubDate")
            pubDate.set("type", kind)
            pubDate.text = value

        pubPlace = ET.SubElement(imprint, "pubPlace")
        ref = ET.SubElement(pubPlace, "ref")
        ref.set("type", "page_url")
        url = entry['url']
        if url is not None:
            # A None attribute value would make serialization fail later.
            ref.set("target", url)

        text = ET.SubElement(tei, "text")
        body = ET.SubElement(text, "body")
        body.extend(entry['text'])

    docTree = ET.ElementTree(docRoot)
    ET.indent(docTree, " ")

    return docTree
lora-sp	132c3e5	2023-03-09 16:32:37 +0100	[diff] [blame]	99
lora-sp	132c3e5	2023-03-09 16:32:37 +0100	[diff] [blame]	100
# Run the conversion only when executed as a script, so that importing this
# module does not read sys.argv or write to standard output.
if __name__ == "__main__":
    main()