metadata
diff --git a/bunc2tei.py b/bunc2tei.py
index 4c30ba7..6c5d54c 100644
--- a/bunc2tei.py
+++ b/bunc2tei.py
@@ -1,11 +1,112 @@
+"""Rebuild a corpus XML file as a ``teiCorpus`` of per-text TEI headers.
+
+Parses one XML file from ``path``, looks up the metadata elements by
+their ``type`` attribute, replaces the document content with one
+``TEI``/``teiHeader`` entry per ``text`` element and writes the result to
+``04_output.xml`` in the current working directory.
+"""
+
 import os
 import xml.etree.ElementTree as ET
 
-path = 'home/spassova/BGCorpusExamples'
-files = os.listdir(path)
+path = "/home/spassova/BGCorpusExamples"
+# os.listdir() returns names in arbitrary order; sort them so that the file
+# picked below is the same one on every run.
+files = sorted(os.listdir(path))
 
-tree = ET.parse(path + '/' + files[0])
+tree = ET.parse(os.path.join(path, files[0]))
 root = tree.getroot()
 
+# Metadata lookups: each list holds every element below the root that carries
+# the given @type, in document order. The header loop further down pairs them
+# with texts by position.
+# NOTE(review): 'mainImageTexts', 'mainImageSources', 'authors', 'authorURLs'
+# and 'tags' are plural while the other type names are singular -- confirm
+# that they match the attribute values actually used in the corpus.
+titles = root.findall(".//*[@type='title']")
+domains = root.findall(".//*[@type='domain']")
+pageURLs = root.findall(".//*[@type='pageURL']")
+ids = root.findall(".//*[@type='id']")
+mainImageURLs = root.findall(".//*[@type='mainImageURL']")
+mainImageTexts = root.findall(".//*[@type='mainImageTexts']")
+mainImageSources = root.findall(".//*[@type='mainImageSources']")
+authors = root.findall(".//*[@type='authors']")
+authorURLs = root.findall(".//*[@type='authorURLs']")
+categories = root.findall(".//*[@type='category']")
+subCategories = root.findall(".//*[@type='subCategory']")
+tags = root.findall(".//*[@type='tags']")
+datesPublished = root.findall(".//*[@type='datePublished']")
+timesPublished = root.findall(".//*[@type='timePublished']")
+datesModified = root.findall(".//*[@type='dateModified']")
+timesModified = root.findall(".//*[@type='timeModified']")
+mainImageWidths = root.findall(".//*[@type='mainImageWidth']")
+mainImageHeights = root.findall(".//*[@type='mainImageHeight']")
+mainImageThumbnailURLs = root.findall(".//*[@type='mainImageThumbnailURL']")
+texts = []
+
+# Strip the metadata children out of every text's content container and
+# count the texts on the way.
+number_of_texts = 0
+for text in root.iter("{http://www.tei-c.org/ns/1.0}text"):
+    number_of_texts += 1
+    # Third-level descendants of <text> (presumably text > body > div > div).
+    for div2 in text.findall("*/*/*"):
+        # Iterate over a snapshot: removing a child while looping over the
+        # live element makes the loop skip the sibling that follows it.
+        for div3 in list(div2):
+            if div3.get("type") == "metadata":
+                div2.remove(div3)
+        # Record each container once rather than once per child.
+        texts.append(div2)
+
+# Empty the root so it can be reused as the corpus container.
+for elem in root.findall("*"):
+    root.remove(elem)
+
+# NOTE(review): this tag and the elements created below are unqualified (no
+# "{namespace}" prefix) -- confirm whether the output is meant to be in the
+# TEI namespace registered before writing.
+root.tag = "teiCorpus"
+
+# One TEI header per text. Metadata is matched to texts by position, which
+# assumes every text has exactly one title, author, publication date and
+# publication time -- TODO confirm against the corpus.
+for i in range(number_of_texts):
+    tei = ET.SubElement(root, "TEI")
+    teiHeader = ET.SubElement(tei, "teiHeader")
+    fileDesc = ET.SubElement(teiHeader, "fileDesc")
+    titleStmt = ET.SubElement(fileDesc, "titleStmt")
+    textSigle = ET.SubElement(titleStmt, "textSigle")
+    textSigle.text = f"BNC/TST.{i:05}"
+    sourceDesc = ET.SubElement(fileDesc, "sourceDesc")
+    analytic = ET.SubElement(sourceDesc, "analytic")
+    htitle = ET.SubElement(analytic, "h.title")
+    htitle.text = titles[i].text
+    hauthor = ET.SubElement(analytic, "h.author")
+    hauthor.text = authors[i].text
+    imprint = ET.SubElement(sourceDesc, "imprint")
+    # The date is split by fixed positions (presumably YYYY-MM-DD). A date
+    # element without text yields empty parts instead of a TypeError.
+    date_text = datesPublished[i].text or ""
+    pub_date_parts = (
+        ("year", date_text[0:4]),
+        ("month", date_text[5:7]),
+        ("day", date_text[8:10]),
+        ("time", timesPublished[i].text),
+    )
+    for part, value in pub_date_parts:
+        pubDate = ET.SubElement(imprint, "pubDate")
+        pubDate.set("type", part)
+        pubDate.text = value
 
 
+# ET.indent() needs Python 3.9+; it only adds whitespace for readability.
+ET.indent(tree, "  ")
+ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
+tree.write(
+    "04_output.xml",
+    encoding="utf-8",
+    xml_declaration=True,
+    method="xml",
+    short_empty_elements=True,
+)
\ No newline at end of file