| lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame^] | 1 | import os, sys | 
| lora-sp | 6697864 | 2023-03-08 11:02:52 +0100 | [diff] [blame] | 2 | import xml.etree.ElementTree as ET | 
| lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame^] | 3 | from xml.dom import minidom | 
|  | 4 |  | 
| lora-sp | 6697864 | 2023-03-08 11:02:52 +0100 | [diff] [blame] | 5 |  | 
def main():
    """Build ``output.p5.xml`` from the document named on the command line.

    Steps:
      1. Write an empty ``<teiCorpus>`` skeleton to ``tree_structure.xml``.
      2. Call ``process(0, sys.argv[1])`` for the input document.
      3. Re-read ``tree_structure.xml``, indent it and write the result to
         ``output.p5.xml`` (UTF-8, with XML declaration).

    Exits with a usage message (non-zero status) when no input file is given,
    before any file is created.
    """
    # Validate the command line first so a missing argument does not leave a
    # half-initialised tree_structure.xml behind or end in a bare IndexError.
    if len(sys.argv) < 2:
        sys.exit("usage: python <script> <input.xml>")
    input_file = sys.argv[1]

    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    skeleton_root = ET.fromstring(corpus)
    skeleton_xml = minidom.parseString(
        ET.tostring(skeleton_root)
    ).toprettyxml(indent="  ")
    with open("tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(skeleton_xml)

    # Process the document; the first argument is the document number that
    # process() formats into each textSigle.
    process(0, input_file)

    # Parse corpus tree, indent and output
    corpusTree = ET.parse("tree_structure.xml")
    ET.indent(corpusTree, "  ")
    corpusTree.write(
        "output.p5.xml",
        encoding="utf-8",
        xml_declaration=True,
        method="xml",
        short_empty_elements=True,
    )
| lora-sp | 6697864 | 2023-03-08 11:02:52 +0100 | [diff] [blame] | 23 |  | 
| lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame^] | 24 |  | 
def process(j, file):
    """Convert one source document and append it to ``tree_structure.xml``.

    The document is parsed, every ``<text>`` element in the TEI namespace is
    turned into a ``<TEI>`` element (header built from the collected metadata
    plus a ``<text><body>`` holding the remaining content), and the renamed
    document root (``teiDoc``) is appended to the corpus root stored in
    ``tree_structure.xml``, which is then written back to disk.

    Args:
        j: Document number; formatted with three digits into each textSigle
            (``BNC/<j>.<text index>``).
        file: Path (or file object) of the source XML document.

    Raises:
        IndexError: If the document contains fewer title / author / date /
            time / page-URL elements, or fewer second-level divs, than
            ``<text>`` elements, since all of them are matched up by position.
        xml.etree.ElementTree.ParseError: If either XML file is malformed.
    """
    tei_ns = "http://www.tei-c.org/ns/1.0"
    corpus_path = "tree_structure.xml"

    # Parse corpus tree and get corpus root
    corpusTree = ET.parse(corpus_path)
    corpusRoot = corpusTree.getroot()

    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()

    # Store metadata in lists; entry i is used for the i-th <text> element.
    # Further metadata types (domain, id, mainImageURL, mainImageTexts,
    # mainImageSources, authorURLs, category, subCategory, tags, dateModified,
    # timeModified, mainImageWidth, mainImageHeight, mainImageThumbnailURL)
    # are not extracted yet.
    titles = root.findall(".//*[@type='title']")
    pageURLs = root.findall(".//*[@type='pageURL']")
    authors = root.findall(".//*[@type='authors']")
    datesPublished = root.findall(".//*[@type='datePublished']")
    timesPublished = root.findall(".//*[@type='timePublished']")

    # Snapshot the <text> elements before touching the tree: modifying a tree
    # while Element.iter() is still running has undefined results.
    text_elements = list(root.iter("{%s}text" % tei_ns))
    number_of_texts = len(text_elements)

    # Collect the content divs (text > body > div > div) and strip their
    # metadata children.
    texts = []
    for text_elem in text_elements:
        for body_elem in text_elem:
            for div1 in body_elem:
                for div2 in div1:
                    # Iterate over a copy: removing from div2 while iterating
                    # it directly would skip the element after each removal.
                    for div3 in list(div2):
                        if div3.get("type") == "metadata":
                            div2.remove(div3)
                    texts.append(div2)

    # Remove all elements from root
    for child in list(root):
        root.remove(child)

    # Rename root
    root.tag = "teiDoc"

    # Create target structure
    for i in range(number_of_texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")

        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = f"BNC/{j:03}.{i:05}"

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        ET.SubElement(analytic, "h.title").text = titles[i].text
        ET.SubElement(analytic, "h.author").text = authors[i].text

        imprint = ET.SubElement(sourceDesc, "imprint")
        # The date is sliced positionally as YYYY?MM?DD (chars 0-4, 5-7, 8-10).
        date = datesPublished[i].text
        pub_dates = (
            ("year", date[0:4]),
            ("month", date[5:7]),
            ("day", date[8:10]),
            ("time", timesPublished[i].text),
        )
        for kind, value in pub_dates:
            pubDate = ET.SubElement(imprint, "pubDate")
            pubDate.set("type", kind)
            pubDate.text = value

        pubPlace = ET.SubElement(imprint, "pubPlace")
        ref = ET.SubElement(pubPlace, "ref")
        ref.set("type", "page_url")
        ref.set("target", pageURLs[i].text)

        text = ET.SubElement(tei, "text")
        body = ET.SubElement(text, "body")
        body.extend(list(texts[i]))

    corpusRoot.append(root)

    # Serialise TEI-namespaced elements without an "ns0:" prefix. This must be
    # registered before writing; it is global and also affects later writes.
    ET.register_namespace("", tei_ns)

    # Persist the extended corpus so callers that re-read tree_structure.xml
    # actually see the appended document.
    corpusTree.write(corpus_path, encoding="utf-8", xml_declaration=True)
| lora-sp | 132c3e5 | 2023-03-09 16:32:37 +0100 | [diff] [blame] | 116 |  | 
| lora-sp | 132c3e5 | 2023-03-09 16:32:37 +0100 | [diff] [blame] | 117 |  | 
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()