blob: 9870285bf355151604c3b081ec79b042799fc586 [file] [log] [blame]
lora-sp66978642023-03-08 11:02:52 +01001import os
2import xml.etree.ElementTree as ET
3
# Directory holding the source corpus files. Only a single file from it is
# parsed per run.
path = "/home/spassova/BGCorpusExamples"
files = os.listdir(path)
if not files:
    # Fail with a clear message instead of an opaque IndexError on files[0].
    raise FileNotFoundError(f"no input files found in {path!r}")

# os.listdir() returns entries in arbitrary order, so "the first file" is
# simply whichever entry the OS happens to list first.
tree = ET.parse(os.path.join(path, files[0]))
root = tree.getroot()
9
def _elements_of_type(type_name):
    """Return all descendants of ``root`` whose ``type`` attribute is *type_name*."""
    return root.findall(f".//*[@type='{type_name}']")


# One list per metadata field, each in document order. The lists are looked
# up across the whole document, not per text, so entries are only related to
# each other by their position in the list.
titles = _elements_of_type("title")
domains = _elements_of_type("domain")
pageURLs = _elements_of_type("pageURL")
ids = _elements_of_type("id")
mainImageURLs = _elements_of_type("mainImageURL")
mainImageTexts = _elements_of_type("mainImageTexts")
mainImageSources = _elements_of_type("mainImageSources")
authors = _elements_of_type("authors")
authorURLs = _elements_of_type("authorURLs")
categories = _elements_of_type("category")
subCategories = _elements_of_type("subCategory")
tags = _elements_of_type("tags")
datesPublished = _elements_of_type("datePublished")
timesPublished = _elements_of_type("timePublished")
datesModified = _elements_of_type("dateModified")
timesModified = _elements_of_type("timeModified")
mainImageWidths = _elements_of_type("mainImageWidth")
mainImageHeights = _elements_of_type("mainImageHeight")
mainImageThumbnailURLs = _elements_of_type("mainImageThumbnailURL")
# Second-level containers of every text, collected after their metadata
# children have been stripped.
texts = []

# Count the <text> elements in the TEI namespace and strip metadata blocks.
# The loop variable names mirror the expected nesting
# (text > body > div > div > div); the code walks children by depth only and
# does not check their tag names.
number_of_texts = 0
for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
    number_of_texts += 1
    for body in text:
        for div1 in body:
            for div2 in div1:
                # Iterate over a snapshot: removing children from div2 while
                # looping over div2 itself makes the iterator skip the
                # sibling that follows each removed element, so adjacent
                # metadata blocks would survive.
                for div3 in list(div2):
                    if div3.get('type') == "metadata":
                        div2.remove(div3)

                # NOTE(review): assumes each div2 is collected exactly once,
                # after its metadata children are removed -- confirm this is
                # the intended nesting level.
                texts.append(div2)
42
43
44#number_of_texts = len(root.findall("{http://www.tei-c.org/ns/1.0}text"))
45
# Detach every direct child of the parsed root so the same root element can
# be refilled with the generated entries. A snapshot list is used because the
# children are removed while being walked.
for child in list(root):
    root.remove(child)

# The root is renamed to an unqualified tag (no namespace URI).
root.tag = "teiCorpus"
50
def _text_or_empty(element):
    """Return ``element.text``, or ``""`` when the element has no text.

    An empty element has ``text`` set to ``None``; slicing it raises
    TypeError, and using it as an attribute value makes serialisation fail
    later, far from the cause.
    """
    return element.text or ""


# Build one <TEI>/<teiHeader> entry per counted text.
# NOTE(review): the metadata lists are matched to texts purely by position;
# a text lacking one field would shift every later value, and a list shorter
# than number_of_texts raises IndexError -- verify the input always carries
# every field for every text.
for i in range(number_of_texts):
    tei = ET.SubElement(root, "TEI")
    teiHeader = ET.SubElement(tei, "teiHeader")
    fileDesc = ET.SubElement(teiHeader, "fileDesc")

    # Text identifier: fixed prefix plus the zero-padded running index.
    titleStmt = ET.SubElement(fileDesc, "titleStmt")
    textSigle = ET.SubElement(titleStmt, "textSigle")
    textSigle.text = f"BNC/TST.{i:05}"

    sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
    analytic = ET.SubElement(sourceDesc, "analytic")
    htitle = ET.SubElement(analytic, "h.title")
    htitle.text = _text_or_empty(titles[i])
    hauthor = ET.SubElement(analytic, "h.author")
    hauthor.text = _text_or_empty(authors[i])

    imprint = ET.SubElement(sourceDesc, "imprint")
    # The date is split by fixed character positions -- assumes a
    # "YYYY-MM-DD" prefix; TODO confirm against the input data.
    date_published = _text_or_empty(datesPublished[i])
    pub_date_parts = (
        ("year", date_published[0:4]),
        ("month", date_published[5:7]),
        ("day", date_published[8:10]),
        ("time", _text_or_empty(timesPublished[i])),
    )
    for part_type, part_value in pub_date_parts:
        pubDate = ET.SubElement(imprint, "pubDate")
        pubDate.set("type", part_type)
        pubDate.text = part_value

    pubPlace = ET.SubElement(imprint, "pubPlace")
    ref = ET.SubElement(pubPlace, "ref")
    ref.set("type", "page_url")
    ref.set("target", _text_or_empty(pageURLs[i]))
lora-sp66978642023-03-08 11:02:52 +010081
82
lora-sp82511632023-03-09 09:12:17 +010083
84
85
# Pretty-print the rebuilt tree in place before it is serialised.
ET.indent(tree, space=" ")

# Map the TEI namespace URI to the empty prefix for serialisation.
ET.register_namespace("", "http://www.tei-c.org/ns/1.0")

# Write the result to the current working directory.
tree.write(
    "04_output.xml",
    encoding="utf-8",
    xml_declaration=True,
    method="xml",
    short_empty_elements=True,
)