lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame] | 1 | import os, sys |
lora-sp | 6697864 | 2023-03-08 11:02:52 +0100 | [diff] [blame] | 2 | import xml.etree.ElementTree as ET |
lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame] | 3 | from xml.dom import minidom |
lora-sp | 09a58a0 | 2023-03-10 16:33:46 +0100 | [diff] [blame] | 4 | from lxml import etree |
| 5 | from io import StringIO |
lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame] | 6 | |
lora-sp | 6697864 | 2023-03-08 11:02:52 +0100 | [diff] [blame] | 7 | |
def main():
    """Build a ``teiCorpus`` file from the documents named on the command line.

    Every command-line argument is handed to ``convert`` together with its
    zero-based position.  The root of each converted tree is appended to an
    initially empty ``<teiCorpus>`` element, and the combined tree is written
    to ``output/corpus.p5.xml``.  The empty corpus skeleton is additionally
    written to ``input/tree_structure.xml``; both directories are created
    when missing.

    A document that cannot be converted is skipped: its name is printed to
    stdout and the reason is reported on stderr.
    """
    # Create corpus structure from string and save into file
    corpus = "<teiCorpus>\n</teiCorpus>"
    origRoot = ET.fromstring(corpus)
    corpusStr = minidom.parseString(ET.tostring(origRoot)).toprettyxml(indent=" ")
    # exist_ok=True avoids the check-then-create race of exists() + makedirs()
    os.makedirs("input", exist_ok=True)
    os.makedirs("output", exist_ok=True)
    with open("input/tree_structure.xml", "w", encoding="utf-8") as f:
        f.write(corpusStr)

    # Parse corpus tree
    corpusTree = ET.parse("input/tree_structure.xml")
    corpusRoot = corpusTree.getroot()

    # Process documents and append to corpus tree
    for index, path in enumerate(sys.argv[1:]):
        try:
            currentRoot = convert(index, path).getroot()
        except Exception as err:
            # Best effort: keep going with the remaining documents.  Only
            # Exception is caught so Ctrl-C and sys.exit() still terminate.
            print(path)
            print(f"skipped {path}: {type(err).__name__}: {err}", file=sys.stderr)
            continue
        corpusRoot.append(currentRoot)

    # Indent and save tree
    ET.indent(corpusTree, " ")
    corpusTree.write(
        "output/corpus.p5.xml",
        encoding='utf-8',
        xml_declaration=True,
        method='xml',
        short_empty_elements=True,
    )
lora-sp | 6697864 | 2023-03-08 11:02:52 +0100 | [diff] [blame] | 38 | |
lora-sp | ab4e0ea | 2023-03-10 12:02:24 +0100 | [diff] [blame] | 39 | |
def _append_pub_date(imprint, kind, value):
    """Append a ``<pubDate type=kind>`` child holding *value* to *imprint*."""
    pub_date = ET.SubElement(imprint, "pubDate")
    pub_date.set("type", kind)
    pub_date.text = value
    return pub_date


def convert(j, file):
    """Restructure one parsed document into a ``teiDoc`` tree.

    The document is parsed with ``ET.parse``.  Metadata is read from every
    element whose ``type`` attribute is ``title``, ``authors``, ``pageURL``,
    ``datePublished`` or ``timePublished``.  For each TEI-namespaced
    ``<text>`` element, the elements four levels below it are searched for
    children with ``type="metadata"``, which are removed.  The root is then
    emptied, renamed to ``teiDoc`` and refilled with one ``<TEI>`` element
    per ``<text>``, consisting of a generated header and a body holding the
    remaining content.

    Args:
        j: Document number; formatted as three digits in the ``textSigle``.
        file: Source accepted by ``ET.parse`` (file name or file object).

    Returns:
        The parsed ``ElementTree`` whose root has been rewritten in place.

    Raises:
        IndexError: If fewer metadata entries or text containers exist than
            ``<text>`` elements.
        TypeError: If a ``datePublished`` element has no text.
    """
    tei_ns = "http://www.tei-c.org/ns/1.0"

    # Parse document tree and get root
    tree = ET.parse(file)
    root = tree.getroot()
    ET.register_namespace("", tei_ns)

    # Store metadata in lists (document order).  This must happen before the
    # metadata divs are detached below.  Further @type values (domain, id,
    # mainImage*, authorURLs, category, subCategory, tags, dateModified,
    # timeModified) are currently not exported.
    titles = root.findall(".//*[@type='title']")
    pageURLs = root.findall(".//*[@type='pageURL']")
    authors = root.findall(".//*[@type='authors']")
    datesPublished = root.findall(".//*[@type='datePublished']")
    timesPublished = root.findall(".//*[@type='timePublished']")
    texts = []

    # Count text elements and remove metadata
    number_of_texts = 0
    for text in root.iter("{%s}text" % tei_ns):
        number_of_texts += 1
        for body in text:
            for div1 in body:
                for div2 in div1:
                    # Iterate over a snapshot: removing children while
                    # iterating the element itself skips the sibling that
                    # follows each removed child.
                    for div3 in list(div2):
                        if div3.get('type') == "metadata":
                            div2.remove(div3)

                    texts.append(div2)

    # Remove all elements from root
    for elem in root.findall("*"):
        root.remove(elem)

    # Rename root
    root.tag = "teiDoc"

    # Create target structure
    for i in range(number_of_texts):
        tei = ET.SubElement(root, "TEI")
        teiHeader = ET.SubElement(tei, "teiHeader")
        fileDesc = ET.SubElement(teiHeader, "fileDesc")
        titleStmt = ET.SubElement(fileDesc, "titleStmt")
        textSigle = ET.SubElement(titleStmt, "textSigle")
        textSigle.text = f"BNC/{j:03}.{i:05}"

        sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
        analytic = ET.SubElement(sourceDesc, "analytic")
        ET.SubElement(analytic, "h.title").text = titles[i].text
        ET.SubElement(analytic, "h.author").text = authors[i].text

        imprint = ET.SubElement(sourceDesc, "imprint")
        # The slices presumably expect an ISO-like "YYYY-MM-DD" prefix;
        # verify against the source data.
        published = datesPublished[i].text
        _append_pub_date(imprint, "year", published[0:4])
        _append_pub_date(imprint, "month", published[5:7])
        _append_pub_date(imprint, "day", published[8:10])
        _append_pub_date(imprint, "time", timesPublished[i].text)

        pubPlace = ET.SubElement(imprint, "pubPlace")
        ref = ET.SubElement(pubPlace, "ref")
        ref.set("type", "page_url")
        ref.set("target", pageURLs[i].text)

        new_text = ET.SubElement(tei, "text")
        new_body = ET.SubElement(new_text, "body")
        for p in texts[i]:
            new_body.append(p)

    return tree
lora-sp | 132c3e5 | 2023-03-09 16:32:37 +0100 | [diff] [blame] | 125 | |
lora-sp | 132c3e5 | 2023-03-09 16:32:37 +0100 | [diff] [blame] | 126 | |
# Run the conversion only when executed as a script, so that importing this
# module (e.g. to reuse convert) does not process sys.argv or write files.
if __name__ == "__main__":
    main()