blob: 830e71592b9e9fc8fed19f7f24fd6ffad0113326 [file] [log] [blame]
lora-sp95b8f922023-04-06 11:28:19 +02001import sys
lora-sp66978642023-03-08 11:02:52 +01002import xml.etree.ElementTree as ET
lora-spab4e0ea2023-03-10 12:02:24 +01003
lora-sp66978642023-03-08 11:02:52 +01004
def main():
    """Convert the XML files named on the command line into one TEI corpus.

    Every command-line argument is treated as an input file.  Each file is
    parsed with ``extract_data`` and turned into a ``teiDoc`` tree with
    ``create_tree``; the resulting roots are collected under a single
    ``teiCorpus`` element, which is indented and written to standard output.

    A file that cannot be processed is reported on standard error and
    skipped, so one bad input does not abort the whole run.
    """
    corpus_root = ET.Element("teiCorpus")

    # File numbers are 0-based and follow the position on the command line,
    # including files that end up being skipped.
    for file_number, path in enumerate(sys.argv[1:]):
        try:
            doc_data = extract_data(path)
            doc_tree = create_tree(doc_data, file_number)
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # SystemExit must still be able to stop the program.
            print("Warning: could not parse file: " + path, file=sys.stderr)
            continue
        corpus_root.append(doc_tree.getroot())

    corpus_tree = ET.ElementTree(corpus_root)
    ET.indent(corpus_tree, " ")
    corpus_tree.write(sys.stdout, encoding='unicode')
lora-sp66978642023-03-08 11:02:52 +010021
lora-spab4e0ea2023-03-10 12:02:24 +010022
def extract_data(file):
    """Parse a TEI XML file and collect metadata and paragraphs per text.

    Every ``text`` element in the TEI namespace is visited in document order.
    Its ``title``, ``url``, ``author`` and ``date`` attributes are read, and
    the ``p`` children of its ``body/div/div`` element are collected.

    The ``date`` attribute is split at single spaces: the first part is
    stored as the date, the second as the time (presumably the attribute
    looks like ``YYYY-MM-DD HH:MM:SS`` -- verify against the input data).
    A missing attribute or a missing time part yields an empty string
    instead of aborting the whole file; a text without ``body/div/div``
    yields an empty paragraph list.

    Args:
        file: File name or file object, passed on to ``ET.parse``.

    Returns:
        A dictionary mapping the 0-based running number of each text to a
        record of the form::

            {'title': str | None, 'url': str | None, 'author': str | None,
             'date': str, 'time': str, 'text': [<p Element>, ...]}

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    tei_ns = "http://www.tei-c.org/ns/1.0"
    prefixes = {"tei": tei_ns}

    root = ET.parse(file).getroot()
    # Global ElementTree setting: TEI elements are serialised in the default
    # namespace rather than with a generated "ns0:" prefix.
    ET.register_namespace("", tei_ns)

    data = {}

    for number, text in enumerate(root.iter(f"{{{tei_ns}}}text")):
        # Split exactly like the original code did (single-space separator),
        # but tolerate a missing attribute or a missing time component.
        stamp_parts = (text.get('date') or "").split(' ')
        container = text.find("tei:body/tei:div/tei:div", prefixes)

        data[number] = {
            'title': text.get('title'),
            'url': text.get('url'),
            'author': text.get('author'),
            'date': stamp_parts[0],
            'time': stamp_parts[1] if len(stamp_parts) > 1 else "",
            'text': [] if container is None
                    else container.findall("tei:p", prefixes),
        }

    return data
lora-spab4e0ea2023-03-10 12:02:24 +010051
lora-spab4e0ea2023-03-10 12:02:24 +010052
def create_tree(data, filenumber):
    """Build the per-file ``teiDoc`` tree from the extracted text records.

    One ``TEI`` element is created per entry of *data*.  It holds a
    ``teiHeader`` with the metadata (sigle, title, author, publication date
    split into year/month/day/time, page URL) and a ``text/body`` element
    containing the paragraph elements.  Text sigles have the form
    ``BNC/<filenumber>.<textnumber>``, zero-padded to three and five digits,
    e.g. ``BNC/000.00000``.

    Args:
        data: Dictionary mapping a text number (int) to a record with the
            keys 'title', 'url', 'author', 'date', 'time' and 'text', as
            returned by ``extract_data``.
        filenumber: Integer index of the source file, used in the sigle.

    Returns:
        An indented ``xml.etree.ElementTree.ElementTree`` rooted at
        ``teiDoc``.
    """
    doc_root = ET.Element("teiDoc")

    for text_number, record in data.items():
        tei = ET.SubElement(doc_root, "TEI")

        # --- header -------------------------------------------------------
        tei_header = ET.SubElement(tei, "teiHeader")
        file_desc = ET.SubElement(tei_header, "fileDesc")
        title_stmt = ET.SubElement(file_desc, "titleStmt")
        text_sigle = ET.SubElement(title_stmt, "textSigle")
        text_sigle.text = f"BNC/{filenumber:03}.{text_number:05}"

        source_desc = ET.SubElement(file_desc, "sourceDesc")
        analytic = ET.SubElement(source_desc, "analytic")
        ET.SubElement(analytic, "h.title").text = record['title']
        ET.SubElement(analytic, "h.author").text = record['author']

        imprint = ET.SubElement(source_desc, "imprint")
        # The date is sliced by fixed positions (chars 0-3, 5-6, 8-9), which
        # presumably corresponds to a YYYY-MM-DD string -- verify against
        # the input.  A missing date is treated as empty.
        date = record['date'] or ""
        pub_dates = (
            ("year", date[0:4]),
            ("month", date[5:7]),
            ("day", date[8:10]),
            ("time", record['time']),
        )
        for kind, value in pub_dates:
            pub_date = ET.SubElement(imprint, "pubDate")
            pub_date.set("type", kind)
            pub_date.text = value

        pub_place = ET.SubElement(imprint, "pubPlace")
        ref = ET.SubElement(pub_place, "ref")
        ref.set("type", "page_url")
        # An attribute value of None cannot be serialised and would make the
        # final write of the whole corpus fail, so the target is left out
        # when the source text carried no url.
        if record['url'] is not None:
            ref.set("target", record['url'])

        # --- text ---------------------------------------------------------
        text = ET.SubElement(tei, "text")
        body = ET.SubElement(text, "body")
        body.extend(record['text'])

    doc_tree = ET.ElementTree(doc_root)
    ET.indent(doc_tree, " ")

    return doc_tree
lora-sp132c3e52023-03-09 16:32:37 +010099
lora-sp132c3e52023-03-09 16:32:37 +0100100
# Run the conversion only when this file is executed as a script, so that
# importing it (e.g. to reuse the helper functions) has no side effects.
if __name__ == "__main__":
    main()