blob: 6c5d54c3c87c7990b827e02d14dee920068dadf3 [file] [log] [blame]
lora-sp66978642023-03-08 11:02:52 +01001import os
2import xml.etree.ElementTree as ET
3
# Directory that holds the source corpus XML files.
path = "/home/spassova/BGCorpusExamples"

# os.listdir() returns the entries in arbitrary order; sort them so that the
# "first" file is the same one on every run and on every machine.
files = sorted(os.listdir(path))
if not files:
    raise FileNotFoundError(f"no input files found in {path!r}")

# Only the first file of the directory is parsed and converted.
tree = ET.parse(os.path.join(path, files[0]))
root = tree.getroot()
9
def _by_type(type_name):
    """Return all descendants of ``root`` whose ``type`` attribute is *type_name*."""
    return root.findall(f".//*[@type='{type_name}']")


# One list per metadata field, each collected from the whole parsed document.
titles = _by_type("title")
domains = _by_type("domain")
pageURLs = _by_type("pageURL")
ids = _by_type("id")
mainImageURLs = _by_type("mainImageURL")
mainImageTexts = _by_type("mainImageTexts")
mainImageSources = _by_type("mainImageSources")
authors = _by_type("authors")
authorURLs = _by_type("authorURLs")
categories = _by_type("category")
subCategories = _by_type("subCategory")
tags = _by_type("tags")
datesPublished = _by_type("datePublished")
timesPublished = _by_type("timePublished")
datesModified = _by_type("dateModified")
timesModified = _by_type("timeModified")
mainImageWidths = _by_type("mainImageWidth")
mainImageHeights = _by_type("mainImageHeight")
mainImageThumbnailURLs = _by_type("mainImageThumbnailURL")

# Starts empty; the pass over the <text> elements appends to it.
texts = []
30
# Count the TEI <text> elements and, three levels below each of them, drop the
# children marked type="metadata"; the cleaned containers are kept in ``texts``.
number_of_texts = 0
for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
    number_of_texts += 1
    for body in text:
        for div1 in body:
            for div2 in div1:
                # Loop over a snapshot of the children: removing an element
                # from ``div2`` while iterating ``div2`` itself shifts the
                # remaining children, so the sibling that follows a removed
                # element would be skipped (and survive if it is metadata too).
                for div3 in list(div2):
                    if div3.get('type') == "metadata":
                        div2.remove(div3)

                # NOTE(review): assumed to run once per ``div2`` (after its
                # metadata has been stripped) - confirm the intended nesting.
                texts.append(div2)

# number_of_texts = len(root.findall("{http://www.tei-c.org/ns/1.0}text"))
45
# Detach every direct child from the root so the element can be reused as the
# container of the new document. The children are snapshotted first because
# the root is modified inside the loop.
for child in list(root):
    root.remove(child)

# The emptied root becomes the corpus wrapper element.
root.tag = "teiCorpus"
50
def _child(parent, tag, text=None, **attributes):
    """Append a new *tag* element to *parent*, set its text and return it."""
    node = ET.SubElement(parent, tag, attributes)
    node.text = text
    return node


# Character ranges of the publication date string that feed the year, month
# and day <pubDate> elements.
# NOTE(review): presumably the dates look like YYYY-MM-DD - confirm.
_DATE_PARTS = (
    ("year", slice(0, 4)),
    ("month", slice(5, 7)),
    ("day", slice(8, 10)),
)

# Build one <TEI>/<teiHeader> skeleton per counted text, filled from the
# metadata lists by position.
for index in range(number_of_texts):
    header = _child(_child(root, "TEI"), "teiHeader")
    file_desc = _child(header, "fileDesc")

    # Running, zero-padded five-digit identifier of the text.
    title_stmt = _child(file_desc, "titleStmt")
    _child(title_stmt, "textSigle", f"BNC/TST.{index:05}")

    source_desc = _child(file_desc, "sourceDesc")
    analytic = _child(source_desc, "analytic")
    _child(analytic, "h.title", titles[index].text)
    _child(analytic, "h.author", authors[index].text)

    # Publication date split into its parts, followed by the time.
    imprint = _child(source_desc, "imprint")
    published = datesPublished[index].text
    for part, span in _DATE_PARTS:
        _child(imprint, "pubDate", published[span], type=part)
    _child(imprint, "pubDate", timesPublished[index].text, type="time")
lora-sp66978642023-03-08 11:02:52 +010077
78
lora-sp82511632023-03-09 09:12:17 +010079
80
81
82
83
84
85
86
87
# Map the TEI namespace to the empty prefix for serialisation, pretty-print
# the tree in place and write it out as UTF-8 XML with an XML declaration.
ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
ET.indent(tree, space=" ")
tree.write(
    "04_output.xml",
    encoding="utf-8",
    xml_declaration=True,
    method="xml",
    short_empty_elements=True,
)