| lora-sp | 95b8f92 | 2023-04-06 11:28:19 +0200 | [diff] [blame^] | 1 | import sys | 
| lora-sp | 6697864 | 2023-03-08 11:02:52 +0100 | [diff] [blame] | 2 | import xml.etree.ElementTree as ET | 
| lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame] | 3 |  | 
| lora-sp | 6697864 | 2023-03-08 11:02:52 +0100 | [diff] [blame] | 4 |  | 
def main():
    ''' Converts the xml files given as command line arguments into one corpus.

    Every argument is read with extract_data() and rebuilt with create_tree();
    the resulting teiDoc elements are collected below a single teiCorpus root,
    which is indented and written to stdout. A file that cannot be processed
    is reported on stderr and skipped.
    '''
    corpusRoot = ET.Element("teiCorpus")

    # The file number is the 0-based argument position, so a skipped file
    # leaves a gap in the generated sigles instead of renumbering later files.
    for filenumber, path in enumerate(sys.argv[1:]):
        try:
            doc_tree = create_tree(extract_data(path), filenumber)
        except Exception:
            # Best effort: one bad input must not abort the whole run.
            # "except Exception" (instead of a bare except) still lets
            # KeyboardInterrupt and SystemExit stop the program.
            print("Warning: could not parse file: " + path, file=sys.stderr)
            continue
        corpusRoot.append(doc_tree.getroot())

    corpusTree = ET.ElementTree(corpusRoot)
    ET.indent(corpusTree, "  ")
    corpusTree.write(sys.stdout, encoding='unicode')
| lora-sp | 6697864 | 2023-03-08 11:02:52 +0100 | [diff] [blame] | 21 |  | 
| lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame] | 22 |  | 
def extract_data(file):
    ''' Parses an xml file and saves the metadata and texts into a dictionary that is returned.
    The dictionary is keyed by the position of each TEI <text> element in the file:
    data = {textnumber: {'title': ..., 'url': ..., 'author': ..., 'date': ...,
                         'time': ..., 'text': [...]}}

    'title', 'url' and 'author' are the attribute values of the <text> element
    (None when the attribute is absent). 'date' and 'time' are the first and
    second space-separated part of the date attribute; a part that is missing
    is stored as an empty string. 'text' is the list of <p> elements found
    directly below body/div/div, or an empty list when that container is missing.

    file is passed unchanged to ET.parse(), so it may be a file name or a
    file object. Errors raised by ET.parse() are not caught here.
    '''
    tei = "{http://www.tei-c.org/ns/1.0}"

    tree = ET.parse(file)
    root = tree.getroot()
    # Global ElementTree setting: serialise elements of the TEI namespace
    # with the default (empty) prefix.
    ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

    data = {}

    for i, text in enumerate(root.iter(f"{tei}text")):
        # Split the date attribute once; a missing attribute or a value
        # without a time part must not discard the whole file.
        date_parts = (text.get('date') or '').split(' ')

        textelem = text.find(f"{tei}body/{tei}div/{tei}div")
        paragraphs = [] if textelem is None else textelem.findall(f"{tei}p")

        data[i] = {
            'title': text.get('title'),
            'url': text.get('url'),
            'author': text.get('author'),
            'date': date_parts[0],
            'time': date_parts[1] if len(date_parts) > 1 else '',
            'text': paragraphs,
        }

    return data
| lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame] | 51 |  | 
| lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame] | 52 |  | 
def create_tree(data, filenumber):
    ''' Receives a dictionary containing the data and returns an xml tree in the
    desired format. Generates text sigles of the following format: BNC/filenumber.textnumber,
    e.g. BNC/000.00000

    data maps a text number to a dictionary with the keys 'title', 'url',
    'author', 'date', 'time' and 'text' (a list of elements that become the
    children of the body). One TEI element is created per entry, in ascending
    order of the text number. The year, month and day are taken from the
    character positions 0-3, 5-6 and 8-9 of 'date'.

    Returns the indented ElementTree whose root is the teiDoc element.
    '''
    docRoot = ET.Element("teiDoc")

    for i, entry in sorted(data.items()):
        tei = ET.SubElement(docRoot, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")

        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = f"BNC/{filenumber:03}.{i:05}"

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        ET.SubElement(analytic, "h.title").text = entry['title']
        ET.SubElement(analytic, "h.author").text = entry['author']

        imprint = ET.SubElement(sourceDesc, "imprint")
        date = entry['date']
        pub_dates = (
            ("year", date[0:4]),
            ("month", date[5:7]),
            ("day", date[8:10]),
            ("time", entry['time']),
        )
        for kind, value in pub_dates:
            pubDate = ET.SubElement(imprint, "pubDate")
            pubDate.set("type", kind)
            pubDate.text = value

        pubPlace = ET.SubElement(imprint, "pubPlace")
        ref = ET.SubElement(pubPlace, "ref")
        ref.set("type", "page_url")
        # An attribute value of None cannot be serialised (ElementTree raises
        # TypeError when the tree is written), so a missing url is written
        # as an empty string.
        ref.set("target", entry['url'] or "")

        text = ET.SubElement(tei, "text")
        body = ET.SubElement(text, "body")
        body.extend(entry['text'])

    docTree = ET.ElementTree(docRoot)
    ET.indent(docTree, "  ")

    return docTree
| lora-sp | 132c3e5 | 2023-03-09 16:32:37 +0100 | [diff] [blame] | 99 |  | 
| lora-sp | 132c3e5 | 2023-03-09 16:32:37 +0100 | [diff] [blame] | 100 |  | 
# Run the conversion only when the file is executed as a script, so that
# importing the module does not read sys.argv or write to stdout.
if __name__ == "__main__":
    main()