"""Convert a TEI document from the corpus directory into a teiCorpus file.

The first document (in sorted order) of the input directory is parsed, the
metadata stored in elements carrying a ``type`` attribute is collected, and
the tree is rebuilt as one ``teiCorpus`` root holding a ``TEI`` element
(header plus text body) for every ``text`` element of the input.
"""

import os
import xml.etree.ElementTree as ET

# Path to documents
path = "/home/spassova/BGCorpusExamples"

# File the converted corpus is written to
OUTPUT_FILE = "04_output.xml"

# Namespace of the input documents
TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"

# Values of the ``type`` attribute that are looked up in the input.
# Only the entries of REQUIRED_METADATA are written to the header so far;
# the remaining ones are collected but not used yet.
METADATA_TYPES = (
    "title",
    "domain",
    "pageURL",
    "id",
    "mainImageURL",
    "mainImageTexts",
    "mainImageSources",
    "authors",
    "authorURLs",
    "category",
    "subCategory",
    "tags",
    "datePublished",
    "timePublished",
    "dateModified",
    "timeModified",
    "mainImageWidth",
    "mainImageHeight",
    "mainImageThumbnailURL",
)

# Metadata the generated header reads for every text
REQUIRED_METADATA = (
    "title",
    "authors",
    "datePublished",
    "timePublished",
    "pageURL",
)


def _collect_metadata(root):
    """Return a dict mapping each metadata type to its elements under *root*."""
    return {
        name: root.findall(f".//*[@type='{name}']")
        for name in METADATA_TYPES
    }


def _collect_texts(root):
    """Return one list of content elements per ``text`` element under *root*.

    NOTE(review): assumes the layout text > body > div > div, where the
    innermost div holds the 'metadata' div(s) next to the content elements.
    Confirm against the corpus files.
    """
    texts = []
    for text in root.iter(f"{{{TEI_NAMESPACE}}}text"):
        content = []
        for body in text:
            for div1 in body:
                for div2 in div1:
                    # Filter instead of removing while iterating: removing a
                    # child during iteration skips the sibling that follows.
                    content.extend(
                        child for child in div2
                        if child.get("type") != "metadata"
                    )
        # Exactly one entry per text keeps the list aligned with the
        # per-text metadata, whatever the number of inner divs is.
        texts.append(content)
    return texts


def _add_header(tei, index, metadata):
    """Append the ``teiHeader`` of text number *index* to *tei*."""

    def value(name):
        # An empty metadata element has ``text`` None; use "" so slicing
        # and attribute serialisation keep working.
        return metadata[name][index].text or ""

    tei_header = ET.SubElement(tei, "teiHeader")
    file_desc = ET.SubElement(tei_header, "fileDesc")
    title_stmt = ET.SubElement(file_desc, "titleStmt")
    text_sigle = ET.SubElement(title_stmt, "textSigle")
    text_sigle.text = f"BNC/TST.{index:05}"

    source_desc = ET.SubElement(file_desc, "sourceDesc")
    analytic = ET.SubElement(source_desc, "analytic")
    ET.SubElement(analytic, "h.title").text = value("title")
    ET.SubElement(analytic, "h.author").text = value("authors")

    imprint = ET.SubElement(source_desc, "imprint")
    date = value("datePublished")
    # The slices below take characters 0-3, 5-6 and 8-9 of the date string.
    pub_dates = (
        ("year", date[0:4]),
        ("month", date[5:7]),
        ("day", date[8:10]),
        ("time", value("timePublished")),
    )
    for kind, content in pub_dates:
        pub_date = ET.SubElement(imprint, "pubDate")
        pub_date.set("type", kind)
        pub_date.text = content

    pub_place = ET.SubElement(imprint, "pubPlace")
    ref = ET.SubElement(pub_place, "ref")
    ref.set("type", "page_url")
    ref.set("target", value("pageURL"))


def convert(root):
    """Rebuild *root* in place as a ``teiCorpus`` and return it.

    Raises:
        ValueError: if fewer elements of a required metadata type exist
            than there are texts.
    """
    # Store metadata and texts before the tree is emptied
    metadata = _collect_metadata(root)
    texts = _collect_texts(root)

    for name in REQUIRED_METADATA:
        if len(metadata[name]) < len(texts):
            raise ValueError(
                f"found {len(metadata[name])} '{name}' element(s) "
                f"for {len(texts)} text(s)"
            )

    # Remove all elements from root (iterate over a copy)
    for elem in list(root):
        root.remove(elem)

    # Rename root
    root.tag = "teiCorpus"

    # Create i5 structure
    for index, content in enumerate(texts):
        tei = ET.SubElement(root, "TEI")
        _add_header(tei, index, metadata)
        text = ET.SubElement(tei, "text")
        body = ET.SubElement(text, "body")
        body.extend(content)

    return root


def main(source_dir=path, output_file=OUTPUT_FILE):
    """Convert the first document of *source_dir* and write *output_file*.

    Raises:
        FileNotFoundError: if *source_dir* contains no entries.
    """
    # os.listdir returns entries in arbitrary order; sort so that the
    # document picked is the same on every run.
    files = sorted(os.listdir(source_dir))
    if not files:
        raise FileNotFoundError(f"no documents found in {source_dir}")

    # Parse tree and get root
    tree = ET.parse(os.path.join(source_dir, files[0]))
    convert(tree.getroot())

    ET.indent(tree, " ")
    # Registering the empty prefix makes the namespace of the copied content
    # elements the default namespace of the output.
    ET.register_namespace("", TEI_NAMESPACE)
    tree.write(
        output_file,
        encoding="utf-8",
        xml_declaration=True,
        method="xml",
        short_empty_elements=True,
    )


if __name__ == "__main__":
    main()