blob: fb08ec55e00565e2ca4df4617b1c3e3cba315ba6 [file] [log] [blame]
lora-sp66978642023-03-08 11:02:52 +01001import os
2import xml.etree.ElementTree as ET
3
# Convert a TEI document into an I5-style <teiCorpus>: one <TEI> element per
# source <text>, each with a <teiHeader> built from the collected metadata and
# a <text>/<body> holding the original content with metadata divs removed.

# Path to documents
path = "/home/spassova/BGCorpusExamples"
# NOTE: os.listdir() returns entries in arbitrary order, so files[0] below is
# simply whichever entry happens to be listed first.
files = os.listdir(path)

# Parse tree and get root (only the first listed file is converted)
tree = ET.parse(os.path.join(path, files[0]))
root = tree.getroot()

# Store metadata and texts in lists.
# Each list holds every element in the document carrying the given @type, in
# document order; the header loop below indexes them by text position, which
# assumes each text contributes exactly one element per list.
titles = root.findall(".//*[@type='title']")
domains = root.findall(".//*[@type='domain']")
pageURLs = root.findall(".//*[@type='pageURL']")
ids = root.findall(".//*[@type='id']")
mainImageURLs = root.findall(".//*[@type='mainImageURL']")
mainImageTexts = root.findall(".//*[@type='mainImageTexts']")
mainImageSources = root.findall(".//*[@type='mainImageSources']")
authors = root.findall(".//*[@type='authors']")
authorURLs = root.findall(".//*[@type='authorURLs']")
categories = root.findall(".//*[@type='category']")
subCategories = root.findall(".//*[@type='subCategory']")
tags = root.findall(".//*[@type='tags']")
datesPublished = root.findall(".//*[@type='datePublished']")
timesPublished = root.findall(".//*[@type='timePublished']")
datesModified = root.findall(".//*[@type='dateModified']")
timesModified = root.findall(".//*[@type='timeModified']")
mainImageWidths = root.findall(".//*[@type='mainImageWidth']")
mainImageHeights = root.findall(".//*[@type='mainImageHeight']")
mainImageThumbnailURLs = root.findall(".//*[@type='mainImageThumbnailURL']")
texts = []

# Count text elements and remove metadata.
# The matches are materialised with list() first because the tree is modified
# inside the loop, and modifying a tree under a live iter() is undefined.
number_of_texts = 0
for text in list(root.iter("{http://www.tei-c.org/ns/1.0}text")):
    number_of_texts += 1
    for body in text:
        for div1 in body:
            for div2 in div1:
                # Iterate over a snapshot of the children: removing from div2
                # while iterating div2 itself skips the sibling that follows
                # each removed element, so adjacent metadata divs would
                # survive.
                for div3 in list(div2):
                    if div3.get("type") == "metadata":
                        div2.remove(div3)

                texts.append(div2)

# Remove all elements from root (the collected div2 elements stay alive
# through the `texts` list)
for elem in root.findall("*"):
    root.remove(elem)

# Rename root
root.tag = "teiCorpus"

# Create i5 structure
for i in range(number_of_texts):
    tei = ET.SubElement(root, "TEI")
    teiHeader = ET.SubElement(tei, "teiHeader")
    fileDesc = ET.SubElement(teiHeader, "fileDesc")

    titleStmt = ET.SubElement(fileDesc, "titleStmt")
    textSigle = ET.SubElement(titleStmt, "textSigle")
    textSigle.text = f"BNC/TST.{i:05}"

    sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
    analytic = ET.SubElement(sourceDesc, "analytic")
    htitle = ET.SubElement(analytic, "h.title")
    htitle.text = titles[i].text
    hauthor = ET.SubElement(analytic, "h.author")
    hauthor.text = authors[i].text

    # One <pubDate> per component; year, month and day are sliced out of the
    # date string at fixed offsets (0:4, 5:7, 8:10).
    imprint = ET.SubElement(sourceDesc, "imprint")
    date_published = datesPublished[i].text
    pub_date_parts = (
        ("year", date_published[0:4]),
        ("month", date_published[5:7]),
        ("day", date_published[8:10]),
        ("time", timesPublished[i].text),
    )
    for part_type, part_value in pub_date_parts:
        pubDate = ET.SubElement(imprint, "pubDate")
        pubDate.set("type", part_type)
        pubDate.text = part_value

    pubPlace = ET.SubElement(imprint, "pubPlace")
    ref = ET.SubElement(pubPlace, "ref")
    ref.set("type", "page_url")
    ref.set("target", pageURLs[i].text)

    # Copy the i-th collected content element's children into the new body
    text = ET.SubElement(tei, "text")
    body = ET.SubElement(text, "body")
    for p in texts[i]:
        body.append(p)

# Pretty-print, make the TEI namespace the default (so no ns0: prefixes are
# emitted), and write the result
ET.indent(tree, " ")
ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
tree.write("04_output.xml", encoding="utf-8", xml_declaration=True, method="xml", short_empty_elements=True)