lora-sp | 95b8f92 | 2023-04-06 11:28:19 +0200 | [diff] [blame] | 1 | import sys |
lora-sp | 6697864 | 2023-03-08 11:02:52 +0100 | [diff] [blame] | 2 | import xml.etree.ElementTree as ET |
lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame] | 3 | |
lora-sp | 6697864 | 2023-03-08 11:02:52 +0100 | [diff] [blame] | 4 | |
def main():
    ''' Converts every file named on the command line and writes the combined
        corpus to standard output.

        Each argument is passed to extract_data and the result to create_tree;
        the resulting <teiDoc> root is appended to a single <teiCorpus> root.
        A file that cannot be processed is reported on stderr and skipped, the
        remaining files are still converted. The file number used for the text
        sigles is the zero-based position of the argument, so a skipped file
        still consumes its number.
    '''
    corpus_root = ET.Element("teiCorpus")

    for file_number, path in enumerate(sys.argv[1:]):
        try:
            doc_tree = create_tree(extract_data(path), file_number)
        except Exception as err:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit still terminate the program instead of being
            # reported as a parse failure.
            print("Warning: could not parse file: " + path
                  + f" ({type(err).__name__}: {err})", file=sys.stderr)
            continue
        corpus_root.append(doc_tree.getroot())

    corpus_tree = ET.ElementTree(corpus_root)
    ET.indent(corpus_tree, " ")
    corpus_tree.write(sys.stdout, encoding='unicode')
lora-sp | 6697864 | 2023-03-08 11:02:52 +0100 | [diff] [blame] | 21 | |
lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame] | 22 | |
def extract_data(file):
    ''' Parses a TEI xml file and returns the metadata and paragraphs of every
        <text> element it contains.

        file: a path or file object accepted by ET.parse.

        The returned dictionary is keyed by the zero-based position of the
        <text> element in document order and has the following form:
        data = {textnumber: {'title', 'url', 'author', 'date', 'time', 'text'}}

        'title', 'url' and 'author' are the attribute values (None if the
        attribute is absent). 'date' and 'time' are the first and second
        space-separated parts of the 'date' attribute; each is an empty string
        if that part is missing. 'text' is the list of <p> elements found
        directly under body/div/div (empty if that container does not exist).

        Errors raised by ET.parse (unreadable or malformed file) propagate to
        the caller.
    '''
    tei_ns = "http://www.tei-c.org/ns/1.0"
    ns = {"tei": tei_ns}

    root = ET.parse(file).getroot()
    # Registered so that TEI-namespaced elements are serialised without a prefix.
    ET.register_namespace("", tei_ns)

    data = {}

    for i, text in enumerate(root.iter(f"{{{tei_ns}}}text")):
        # A missing attribute or a date without a time part must not abort
        # the whole file, so both parts fall back to an empty string.
        date_parts = (text.get('date') or '').split(' ')

        container = text.find("./tei:body/tei:div/tei:div", ns)
        paragraphs = [] if container is None else container.findall("./tei:p", ns)

        data[i] = {
            'title': text.get('title'),
            'url': text.get('url'),
            'author': text.get('author'),
            'date': date_parts[0],
            'time': date_parts[1] if len(date_parts) > 1 else '',
            'text': paragraphs,
        }

    return data
lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame] | 51 | |
lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame] | 52 | |
def create_tree(data, filenumber):
    ''' Receives a dictionary containing the data and returns an xml tree in the
        desired format. Generates text sigles of the following format:
        BNC/filenumber.textnumber, e.g. BNC/000.00000

        data: dictionary mapping a text number to a dictionary with the keys
              'title', 'url', 'author', 'date', 'time' and 'text'. The year,
              month and day are sliced from 'date' at positions 0-4, 5-7 and
              8-10; 'text' is an iterable of elements that are appended to
              the <body> of the generated text.
        filenumber: integer used as the first part of every text sigle.

        Returns an ElementTree whose root is a <teiDoc> element holding one
        <TEI> element per entry of data.
    '''
    doc_root = ET.Element("teiDoc")

    for i, entry in data.items():
        tei = ET.SubElement(doc_root, "TEI")
        tei_header = ET.SubElement(tei, "teiHeader")
        file_desc = ET.SubElement(tei_header, "fileDesc")

        title_stmt = ET.SubElement(file_desc, "titleStmt")
        text_sigle = ET.SubElement(title_stmt, "textSigle")
        text_sigle.text = f"BNC/{filenumber:03}.{i:05}"

        source_desc = ET.SubElement(file_desc, "sourceDesc")
        analytic = ET.SubElement(source_desc, "analytic")
        ET.SubElement(analytic, "h.title").text = entry['title']
        ET.SubElement(analytic, "h.author").text = entry['author']

        imprint = ET.SubElement(source_desc, "imprint")
        date = entry['date']
        pub_dates = (
            ("year", date[0:4]),
            ("month", date[5:7]),
            ("day", date[8:10]),
            ("time", entry['time']),
        )
        for kind, value in pub_dates:
            pub_date = ET.SubElement(imprint, "pubDate")
            pub_date.set("type", kind)
            pub_date.text = value

        pub_place = ET.SubElement(imprint, "pubPlace")
        ref = ET.SubElement(pub_place, "ref")
        ref.set("type", "page_url")
        # An attribute value of None cannot be serialised and would make the
        # later write of the whole tree fail, so a missing url becomes "".
        url = entry['url']
        ref.set("target", url if url is not None else "")

        text = ET.SubElement(tei, "text")
        body = ET.SubElement(text, "body")
        body.extend(entry['text'])

    doc_tree = ET.ElementTree(doc_root)
    ET.indent(doc_tree, " ")

    return doc_tree
lora-sp | 132c3e5 | 2023-03-09 16:32:37 +0100 | [diff] [blame] | 99 | |
lora-sp | 132c3e5 | 2023-03-09 16:32:37 +0100 | [diff] [blame] | 100 | |
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()