Blame - nor2tei.py - ICC/nor2korap

blob: 2a28da53b91db340e0ab1d2746f136bc47de2bc1 [file] [log] [blame]

lora-sp	2e2a21a	2023-03-30 13:54:59 +0200	[diff] [blame]	1	import os, sys
				2	import xml.etree.ElementTree as ET
				3	from xml.dom import minidom
Marc Kupietz	17269e4	2023-05-25 10:59:01 +0200	[diff] [blame]	4	import traceback
				5	import sys
lora-sp	2e2a21a	2023-03-30 13:54:59 +0200	[diff] [blame]	6
				7	def main():
				8	# Create corpus structure from string and save into file
				9	corpus = "<teiCorpus>\n</teiCorpus>"
				10	origRoot = ET.fromstring(corpus)
				11	corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
				12	if not os.path.exists('input'):
				13	os.makedirs("input")
				14	if not os.path.exists('output'):
				15	os.makedirs("output")
				16	with open("input/tree_structure.xml", "w") as f:
				17	f.write(corpusStr)
				18
				19	# Parse corpus tree
				20	corpusTree = ET.parse("input/tree_structure.xml")
				21	corpusRoot = corpusTree.getroot()
				22
				23	# Process documents and append to corpus tree
				24	for j in range(1, len(sys.argv)):
Marc Kupietz	17269e4	2023-05-25 10:59:01 +0200	[diff] [blame]	25	try:
				26	currentTree = convert(j-1, sys.argv[j])
				27	currentRoot = currentTree.getroot()
				28	corpusRoot.append(currentRoot)
				29	except:
				30	print("ERROR:"+sys.argv[j])
				31	print(traceback.format_exc())
				32	print(sys.exc_info()[2])
				33	continue
lora-sp	2e2a21a	2023-03-30 13:54:59 +0200	[diff] [blame]	34
				35	# Indent and save tree
				36	ET.indent(corpusTree, " ")
				37	corpusTree.write("output/corpus.p5.xml", encoding='utf-8', xml_declaration=True, method='xml', short_empty_elements=True)
				38
				39
				40	def convert(j, file):
				41	# Parse document tree and get root
				42	tree = ET.parse(file)
				43	root = tree.getroot()
				44	ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
				45
				46	# Store metadata and texts in lists
				47	titles = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}title")
				48	uris = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}pubPlace")
				49	authors = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}author")
				50	dates = root.find(".//*{http://www.tei-c.org/ns/1.0}bibl/{http://www.tei-c.org/ns/1.0}date")
Marc Kupietz	13367dd	2023-05-25 11:00:33 +0200	[diff] [blame^]	51	texts = root.findall(".//*{http://www.tei-c.org/ns/1.0}body/{http://www.tei-c.org/ns/1.0}div")
lora-sp	2e2a21a	2023-03-30 13:54:59 +0200	[diff] [blame]	52	domains = root.find(".//*[@type]")
				53
				54	# Count text elements
				55	number_of_texts = 0
				56	for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
				57	for body in text:
				58	for div in body:
				59	number_of_texts+=1
				60
Marc Kupietz	13367dd	2023-05-25 11:00:33 +0200	[diff] [blame^]	61	number_of_texts = len(texts)
lora-sp	2e2a21a	2023-03-30 13:54:59 +0200	[diff] [blame]	62	# Remove all elements from root
				63	for elem in root.findall("*"):
				64	root.remove(elem)
				65
				66	# Rename root
				67	root.tag = "teiDoc"
				68
				69	# Create target structure
				70	for i in range(number_of_texts):
				71	tei = ET.SubElement(root, "TEI")
				72	teiHeader = ET.SubElement(tei, "teiHeader")
				73	fileDesc = ET.SubElement(teiHeader, "fileDesc")
				74	titleStmt = ET.SubElement(fileDesc, "titleStmt")
				75	textSigle = ET.SubElement(titleStmt, "textSigle")
Marc Kupietz	386c8af	2023-05-23 17:56:42 +0200	[diff] [blame]	76	textSigle.text = "NOR/" + f"{j:03}" + "." + f"{i:05}"
lora-sp	2e2a21a	2023-03-30 13:54:59 +0200	[diff] [blame]	77	sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
				78	analytic = ET.SubElement(sourceDesc, "analytic")
				79	title = ET.SubElement(titleStmt, "title")
				80	title.text = titles.text
				81	hauthor = ET.SubElement(analytic, "h.author")
				82	hauthor.text = authors.text
				83	imprint = ET.SubElement(sourceDesc, "imprint")
				84	pubDateYear = ET.SubElement(imprint, "pubDate")
				85	pubDateYear.set("type", "year")
				86	pubDateYear.text = dates.text
				87	pubDateMonth = ET.SubElement(imprint, "pubDate")
				88	pubDateMonth.set("type", "month")
				89	#pubDateMonth.text = datesPublished[i].text
				90	pubDateDay = ET.SubElement(imprint, "pubDate")
				91	pubDateDay.set("type", "day")
				92	#pubDateDay.text = datesPublished[i].text
				93	pubPlace = ET.SubElement(imprint, "pubPlace")
				94	idno = ET.SubElement(pubPlace, "idno")
				95	idno.set("type", "URI")
				96	idno.text = uris.text
				97	domain = ET.SubElement(titleStmt, "domain")
				98	domain.text = domains.get("type")
Marc Kupietz	1d9fd6b	2023-05-23 17:54:20 +0200	[diff] [blame]	99	splitFName = file.split("/")
				100	if (splitFName[-2] in ['Persuasive', 'Blog']):
				101	domain.text = splitFName[-2]
Marc Kupietz	7f9a493	2023-05-25 10:58:15 +0200	[diff] [blame]	102	elif(splitFName[-4] != "Originaldaten"):
				103	domain.text = splitFName[-4] + ':' + splitFName[-3] + ':' + splitFName[-2]
Marc Kupietz	1d9fd6b	2023-05-23 17:54:20 +0200	[diff] [blame]	104	else:
				105	domain.text = splitFName[-3] + ':' + splitFName[-2]
Marc Kupietz	7f9a493	2023-05-25 10:58:15 +0200	[diff] [blame]	106	domain.text = domain.text.replace("_hobbies", "_Hobbies")
lora-sp	2e2a21a	2023-03-30 13:54:59 +0200	[diff] [blame]	107	text = ET.SubElement(tei, "text")
				108	text.append(texts[i])
				109
				110	return tree
				111
				112
				113	main()