"""Convert a TEI corpus document into a ``teiCorpus`` file.

The script parses one document from ``CORPUS_DIR``, collects the metadata
elements it contains, strips the metadata divisions from every text and
rebuilds the tree in the i5 structure (one ``TEI`` element with a
``teiHeader`` and a ``text`` per source text) before writing ``OUTPUT_FILE``.
"""

import os
import xml.etree.ElementTree as ET

# Path to documents
CORPUS_DIR = "/home/spassova/BGCorpusExamples"
# File the converted corpus is written to (relative to the working directory).
OUTPUT_FILE = "04_output.xml"
# Namespace of the source documents; also registered as the default namespace
# when the result is serialized.
TEI_NS = "http://www.tei-c.org/ns/1.0"

# Every ``type`` attribute value that is looked up in the source document.
METADATA_TYPES = (
    "title",
    "domain",
    "pageURL",
    "id",
    "mainImageURL",
    "mainImageTexts",
    "mainImageSources",
    "authors",
    "authorURLs",
    "category",
    "subCategory",
    "tags",
    "datePublished",
    "timePublished",
    "dateModified",
    "timeModified",
    "mainImageWidth",
    "mainImageHeight",
    "mainImageThumbnailURL",
)
# The subset that is actually written into the generated headers; these must
# be available for every text.
REQUIRED_METADATA_TYPES = (
    "title",
    "authors",
    "datePublished",
    "timePublished",
    "pageURL",
)


def _strip_metadata(text):
    """Remove metadata divisions below *text* and return the remaining content.

    Walks three levels below *text* (child, grandchild, great-grandchild),
    removes every element whose ``type`` attribute is ``"metadata"`` from the
    great-grandchildren, and returns the elements that are left in them as
    one flat list, in document order.
    """
    content = []
    for body in text:
        for outer_div in body:
            for inner_div in outer_div:
                # Iterate over a snapshot: removing a child from the element
                # that is being iterated would skip the following sibling.
                for child in list(inner_div):
                    if child.get("type") == "metadata":
                        inner_div.remove(child)
                content.extend(inner_div)
    return content


def _append_document(root, index, metadata, content):
    """Append one ``TEI`` element (header plus text body) to *root*.

    *metadata* maps a metadata type to the list of matching source elements;
    the entry at position *index* of each required list describes this text.
    *content* is the list of elements that become the children of the body.
    """

    def value(kind):
        # Elements without text yield "" so that the positional date slicing
        # below cannot fail on None; an empty string serializes exactly like
        # a missing text.
        return metadata[kind][index].text or ""

    tei = ET.SubElement(root, "TEI")
    file_desc = ET.SubElement(ET.SubElement(tei, "teiHeader"), "fileDesc")
    title_stmt = ET.SubElement(file_desc, "titleStmt")
    ET.SubElement(title_stmt, "textSigle").text = f"BNC/TST.{index:05}"

    source_desc = ET.SubElement(file_desc, "sourceDesc")
    analytic = ET.SubElement(source_desc, "analytic")
    ET.SubElement(analytic, "h.title").text = value("title")
    ET.SubElement(analytic, "h.author").text = value("authors")

    imprint = ET.SubElement(source_desc, "imprint")
    date = value("datePublished")
    # Positional slices: they yield year, month and day for a date written
    # as YYYY-MM-DD.
    # NOTE(review): confirm that every source date uses this layout.
    pub_dates = (
        ("year", date[0:4]),
        ("month", date[5:7]),
        ("day", date[8:10]),
        ("time", value("timePublished")),
    )
    for part, text in pub_dates:
        pub_date = ET.SubElement(imprint, "pubDate")
        pub_date.set("type", part)
        pub_date.text = text

    pub_place = ET.SubElement(imprint, "pubPlace")
    ref = ET.SubElement(pub_place, "ref")
    ref.set("type", "page_url")
    ref.set("target", value("pageURL"))

    body = ET.SubElement(ET.SubElement(tei, "text"), "body")
    body.extend(content)


def build_corpus(root):
    """Rebuild *root* in place as a ``teiCorpus`` in the i5 structure.

    Args:
        root: Root element of a parsed source document whose texts are
            ``text`` elements in the TEI namespace.

    Returns:
        The same *root* element, renamed to ``teiCorpus`` and holding one
        ``TEI`` child per source text.

    Raises:
        ValueError: If the document holds fewer elements of a required
            metadata type than it holds texts. The check runs before the
            tree is modified.
    """
    texts = list(root.iter(f"{{{TEI_NS}}}text"))

    # Store metadata in lists. This has to happen before the metadata
    # divisions are stripped from the texts below.
    # NOTE(review): metadata is paired with texts by document order over the
    # whole file; this presumes each text carries exactly one element of
    # every required type - confirm against the corpus format.
    metadata = {}
    for kind in METADATA_TYPES:
        metadata[kind] = root.findall(f".//*[@type='{kind}']")
    for kind in REQUIRED_METADATA_TYPES:
        if len(metadata[kind]) < len(texts):
            raise ValueError(
                f"found {len(metadata[kind])} element(s) of type {kind!r} "
                f"for {len(texts)} text(s)"
            )

    # Remove metadata; keep one content list per text so that position i
    # always belongs to the i-th text.
    contents = [_strip_metadata(text) for text in texts]

    # Remove all elements from root
    for child in root.findall("*"):
        root.remove(child)

    # Rename root
    root.tag = "teiCorpus"

    # Create i5 structure
    for index, content in enumerate(contents):
        _append_document(root, index, metadata, content)
    return root


def main(corpus_dir=CORPUS_DIR, output_file=OUTPUT_FILE):
    """Convert the first document of *corpus_dir* and write *output_file*.

    The directory listing is sorted so that "first" does not depend on the
    order in which the operating system returns the entries.

    Raises:
        FileNotFoundError: If *corpus_dir* contains no entries.
    """
    files = sorted(os.listdir(corpus_dir))
    if not files:
        raise FileNotFoundError(f"no input documents found in {corpus_dir!r}")

    # Parse tree and get root
    tree = ET.parse(os.path.join(corpus_dir, files[0]))
    build_corpus(tree.getroot())

    # Serialize the TEI namespace as the default namespace instead of an
    # auto-generated ``ns0:`` prefix.
    ET.register_namespace("", TEI_NS)
    ET.indent(tree, " ")
    tree.write(
        output_file,
        encoding="utf-8",
        xml_declaration=True,
        method="xml",
        short_empty_elements=True,
    )


if __name__ == "__main__":
    main()